diff --git a/build.xml b/build.xml index 18ab3e684..a93918ec8 100644 --- a/build.xml +++ b/build.xml @@ -899,7 +899,7 @@ - + @@ -1023,6 +1023,24 @@ + + + + + + + + + + + + + + + + + + @@ -1186,14 +1204,16 @@ + + listeners="org.testng.reporters.FailedReporter,org.testng.reporters.JUnitXMLReporter,org.broadinstitute.sting.TestNGTestTransformer,org.broadinstitute.sting.StingTextReporter,org.uncommons.reportng.HTMLReporter"> diff --git a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java index 59357e1c4..a61614481 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtils.java @@ -67,7 +67,6 @@ public class AlleleBiasedDownsamplingUtils { alleleStratifiedElements[baseIndex].add(pe); } - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. // Unfortunately, we need to maintain the original pileup ordering of reads or FragmentUtils will complain later. int numReadsToRemove = (int)(pileup.getNumberOfElements() * downsamplingFraction); // floor final TreeSet elementsToKeep = new TreeSet(new Comparator() { @@ -78,12 +77,21 @@ public class AlleleBiasedDownsamplingUtils { } }); + // make a listing of allele counts + final int[] alleleCounts = new int[4]; + for ( int i = 0; i < 4; i++ ) + alleleCounts[i] = alleleStratifiedElements[i].size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + for ( int i = 0; i < 4; i++ ) { final ArrayList alleleList = alleleStratifiedElements[i]; - if ( alleleList.size() <= numReadsToRemove ) - logAllElements(alleleList, log); + // if we don't need to remove any reads, keep them all + if ( alleleList.size() <= targetAlleleCounts[i] ) + elementsToKeep.addAll(alleleList); else - elementsToKeep.addAll(downsampleElements(alleleList, numReadsToRemove, log)); + elementsToKeep.addAll(downsampleElements(alleleList, alleleList.size() - targetAlleleCounts[i], log)); } // clean up pointers so memory can be garbage collected if needed @@ -93,6 +101,66 @@ public class AlleleBiasedDownsamplingUtils { return new ReadBackedPileupImpl(pileup.getLocation(), new ArrayList(elementsToKeep)); } + private static int scoreAlleleCounts(final int[] alleleCounts) { + if ( alleleCounts.length < 2 ) + return 0; + + // sort the counts (in ascending order) + final int[] alleleCountsCopy = alleleCounts.clone(); + Arrays.sort(alleleCountsCopy); + + final int maxCount = alleleCountsCopy[alleleCounts.length - 1]; + final int nextBestCount = alleleCountsCopy[alleleCounts.length - 2]; + + int remainderCount = 0; + for ( int i = 0; i < alleleCounts.length - 2; i++ ) + remainderCount += alleleCountsCopy[i]; + + // try to get the best score: + // - in the het case the counts should be equal with nothing else + // - in the hom case the non-max should be zero + return Math.min(maxCount - nextBestCount + remainderCount, Math.abs(nextBestCount + remainderCount)); + } + + /** + * Computes an allele biased version of the given pileup + * + * @param alleleCounts the original pileup + * @param numReadsToRemove fraction of total reads to remove per allele + * @return allele biased pileup + */ + protected static int[] runSmartDownsampling(final int[] alleleCounts, final int numReadsToRemove) { + final int numAlleles = alleleCounts.length; + + int maxScore = scoreAlleleCounts(alleleCounts); + int[] alleleCountsOfMax = alleleCounts; + + final int numReadsToRemovePerAllele = numReadsToRemove / 2; + + for ( int i = 0; i < numAlleles; i++ ) { + for ( int j = i; j < numAlleles; j++ ) { + final int[] newCounts = alleleCounts.clone(); + + // split these cases so we don't lose on the floor (since we divided by 2) + if ( i == j ) { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemove); + } else { + newCounts[i] = Math.max(0, newCounts[i] - numReadsToRemovePerAllele); + newCounts[j] = Math.max(0, newCounts[j] - numReadsToRemovePerAllele); + } + + final int score = scoreAlleleCounts(newCounts); + + if ( score < maxScore ) { + maxScore = score; + alleleCountsOfMax = newCounts; + } + } + } + + return alleleCountsOfMax; + } + /** * Performs allele biased down-sampling on a pileup and computes the list of elements to keep * @@ -102,7 +170,15 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO KEEP */ private static List downsampleElements(final ArrayList elements, final int numElementsToRemove, final PrintStream log) { + if ( numElementsToRemove == 0 ) + return elements; + final int pileupSize = elements.size(); + if ( numElementsToRemove == pileupSize ) { + logAllElements(elements, log); + return new ArrayList(0); + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); @@ -132,15 +208,25 @@ public class AlleleBiasedDownsamplingUtils { for ( final List reads : alleleReadMap.values() ) totalReads += reads.size(); - // Down-sample *each* allele by the contamination fraction applied to the entire pileup. int numReadsToRemove = (int)(totalReads * downsamplingFraction); - final List readsToRemove = new ArrayList(numReadsToRemove * alleleReadMap.size()); - for ( final List reads : alleleReadMap.values() ) { - if ( reads.size() <= numReadsToRemove ) { - readsToRemove.addAll(reads); - logAllReads(reads, log); - } else { - readsToRemove.addAll(downsampleReads(reads, numReadsToRemove, log)); + + // make a listing of allele counts + final List alleles = new ArrayList(alleleReadMap.keySet()); + alleles.remove(Allele.NO_CALL); // ignore the no-call bin + final int numAlleles = alleles.size(); + final int[] alleleCounts = new int[numAlleles]; + for ( int i = 0; i < numAlleles; i++ ) + alleleCounts[i] = alleleReadMap.get(alleles.get(i)).size(); + + // do smart down-sampling + final int[] targetAlleleCounts = runSmartDownsampling(alleleCounts, numReadsToRemove); + + final List readsToRemove = new ArrayList(numReadsToRemove); + for ( int i = 0; i < numAlleles; i++ ) { + final List alleleBin = alleleReadMap.get(alleles.get(i)); + + if ( alleleBin.size() > targetAlleleCounts[i] ) { + readsToRemove.addAll(downsampleReads(alleleBin, alleleBin.size() - targetAlleleCounts[i], log)); } } @@ -156,13 +242,22 @@ public class AlleleBiasedDownsamplingUtils { * @return the list of pileup elements TO REMOVE */ private static List downsampleReads(final List reads, final int numElementsToRemove, final PrintStream log) { + final ArrayList readsToRemove = new ArrayList(numElementsToRemove); + + if ( numElementsToRemove == 0 ) + return readsToRemove; + final int pileupSize = reads.size(); + if ( numElementsToRemove == pileupSize ) { + logAllReads(reads, log); + return reads; + } + final BitSet itemsToRemove = new BitSet(pileupSize); for ( Integer selectedIndex : MathUtils.sampleIndicesWithoutReplacement(pileupSize, numElementsToRemove) ) { itemsToRemove.set(selectedIndex); } - ArrayList readsToRemove = new ArrayList(pileupSize - numElementsToRemove); for ( int i = 0; i < pileupSize; i++ ) { if ( itemsToRemove.get(i) ) { final GATKSAMRecord read = reads.get(i); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3cdf3d75e..629a27c48 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -253,7 +253,6 @@ public class ReduceReads extends ReadWalker, ReduceRea intervalList.addAll(toolkit.getIntervals()); - // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; final boolean keep_records = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24a3ba3cb..fff1c20a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -220,7 +220,6 @@ public class SlidingWindow { regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { readsInWindow.pollFirst(); } @@ -607,9 +606,7 @@ public class SlidingWindow { toRemove.add(read); } } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); } return allReads; } @@ -805,9 +802,8 @@ public class SlidingWindow { hetReads.add(finalizeRunningConsensus()); } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); + return hetReads; } @@ -924,5 +920,11 @@ public class SlidingWindow { } } } + + private void removeReadsFromWindow (List readsToRemove) { + for (GATKSAMRecord read : readsToRemove) { + readsInWindow.remove(read); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d91df82e2..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -31,7 +31,6 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -42,163 +41,26 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } - // WARN - // This function is the streamlined approach, currently not being used by default - // WARN - // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. - // WARN - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser ) { - // Prepare the list of haplotype indices to genotype - final ArrayList allelesToGenotype = new ArrayList(); - - for( final Haplotype h : haplotypes ) { - allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); - } - final int numHaplotypes = haplotypes.size(); - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); - } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here - - // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF - final ArrayList haplotypesToRemove = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF - haplotypesToRemove.add(h); - } - } - haplotypes.removeAll(haplotypesToRemove); - - if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { - final List>>> returnVCs = new ArrayList>>>(); - // set up the default 1-to-1 haplotype mapping object - final HashMap> haplotypeMapping = new HashMap>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.put(call.getAllele(h.getBases()), list); - } - returnVCs.add( new Pair>>(call,haplotypeMapping) ); - return returnVCs; - } - - final ArrayList>>> returnCalls = new ArrayList>>>(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - } - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - startPosKeySet.addAll(h.getEventMap().keySet()); - } - - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } - - // Walk along each position in the key set and create each event to be outputted - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); - for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - final HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - - if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); - } - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final int myNumHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); - int glIndex = 0; - for( int iii = 0; iii < myNumHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - - // using the allele mapping object translate the haplotype allele into the event allele - final Genotype g = new GenotypeBuilder(sample) - .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) - .phased(loc != startPosKeySet.first()) - .PL(genotypeLikelihoods).make(); - myGenotypes.add(g); - } - returnCalls.add( new Pair>>( - new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); - } - } - return returnCalls; - } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final ArrayList activeAllelesToGenotype ) { + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final ArrayList>>> returnCalls = new ArrayList>>>(); + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -207,7 +69,7 @@ public class GenotypingEngine { for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } + if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() ); @@ -217,10 +79,10 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } - if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! + if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { startPosKeySet.add( compVC.getStart() ); } @@ -232,7 +94,7 @@ public class GenotypingEngine { final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view - if( activeAllelesToGenotype.isEmpty() ) { + if( !in_GGA_mode ) { for( final Haplotype h : haplotypes ) { final HashMap eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); @@ -261,7 +123,14 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); + alleleOrdering.add(refAllele); + for( final VariantContext vc : eventsAtThisLoc ) { + alleleOrdering.add(vc.getAlternateAllele(0)); + } // Sanity check the priority list for( final VariantContext vc : eventsAtThisLoc ) { @@ -283,11 +152,15 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper + // let's update the Allele keys in the mapper because they can change after merging when there are complex events + Map> updatedAlleleMapper = new HashMap>(alleleMapper.size()); + for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) { + final Allele oldAllele = alleleOrdering.get(i); + final Allele newAllele = mergedVC.getAlleles().get(i); + updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele)); + alleleOrdering.set(i, newAllele); } + alleleMapper = updatedAlleleMapper; if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); @@ -299,7 +172,7 @@ public class GenotypingEngine { for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -313,23 +186,23 @@ public class GenotypingEngine { if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); + final HashMap> alleleHashMapTrim = new HashMap>(); for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); } call = vcCallTrim; - alleleHashMap = alleleHashMapTrim; + alleleMapper = alleleHashMapTrim; } - returnCalls.add( new Pair>>(call, alleleHashMap) ); + returnCalls.add( new Pair>>(call, alleleMapper) ); } } } return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { @@ -348,7 +221,7 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -395,7 +268,9 @@ public class GenotypingEngine { final ArrayList haplotypeList = new ArrayList(); haplotypeList.add(h); for( final String sample : haplotypes.get(0).getSampleKeySet() ) { - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0]; + final HashSet sampleSet = new HashSet(1); + sampleSet.add(sample); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } @@ -489,37 +364,87 @@ public class GenotypingEngine { @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { - final ArrayList> alleleMapper = new ArrayList>(); - final ArrayList refList = new ArrayList(); + protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + alleleMapper.put(refAllele, new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) + alleleMapper.put(vc.getAlternateAllele(0), new ArrayList()); + + final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - refList.add(h); + alleleMapper.get(refAllele).add(h); + } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { + alleleMapper.get(h.getArtificialAllele()).add(h); } else { - boolean foundInEventList = false; + boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - foundInEventList = true; + alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h); + haplotypeIsDetermined = true; + break; } } - if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype - refList.add(h); - } + + if( !haplotypeIsDetermined ) + undeterminedHaplotypes.add(h); } } - alleleMapper.add(refList); - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - final ArrayList list = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - list.add(h); + + for( final Haplotype h : undeterminedHaplotypes ) { + Allele matchingAllele = null; + for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + // don't test against the reference allele + if( alleleToTest.getKey().equals(refAllele) ) + continue; + + final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { + matchingAllele = alleleToTest.getKey(); + break; } } - alleleMapper.add(list); + + if( matchingAllele == null ) + matchingAllele = refAllele; + alleleMapper.get(matchingAllele).add(h); } + return alleleMapper; } + protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) { + + for ( final Map.Entry fromSubset : subset.entrySet() ) { + final VariantContext fromSuperset = superset.get(fromSubset.getKey()); + if ( fromSuperset == null ) + return false; + + List supersetAlleles = fromSuperset.getAlternateAlleles(); + if ( resolveSupersetToSubset ) + supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles); + + if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) ) + return false; + } + + return true; + } + + private static List resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List currentAlleles) { + if ( targetReference.length() <= actualReference.length() ) + return currentAlleles; + + final List newAlleles = new ArrayList(currentAlleles.size()); + final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length()); + for ( final Allele a : currentAlleles ) { + newAlleles.add(Allele.extend(a, extraBases)); + } + return newAlleles; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 5aba23faa..24b3309f1 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -26,7 +26,6 @@ package org.broadinstitute.sting.gatk.walkers.haplotypecaller; import com.google.java.contract.Ensures; -import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.commandline.*; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; @@ -41,7 +40,10 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.gatk.walkers.annotator.VariantAnnotatorEngine; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; -import org.broadinstitute.sting.gatk.walkers.genotyper.*; +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeLikelihoodsCalculationModel; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection; +import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; +import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; @@ -129,14 +131,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -212,7 +206,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private VariantAnnotatorEngine annotationEngine; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // reference base padding size private static final int REFERENCE_PADDING = 900; @@ -246,10 +240,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -271,15 +266,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -296,7 +282,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -324,15 +310,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } } if( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ) { - return new ActivityProfileResult(1.0); + return new ActivityProfileResult(ref.getLocus(), 1.0); } } if( USE_ALLELES_TRIGGER ) { - return new ActivityProfileResult( tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); + return new ActivityProfileResult( ref.getLocus(), tracker.getValues(UG_engine.getUAC().alleles, ref.getLocus()).size() > 0 ? 1.0 : 0.0 ); } - if( context == null ) { return new ActivityProfileResult(0.0); } + if( context == null ) { return new ActivityProfileResult(ref.getLocus(), 0.0); } final List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied noCall.add(Allele.NO_CALL); @@ -369,7 +355,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); final double isActiveProb = vcOut == null ? 0.0 : QualityUtils.qualToProb( vcOut.getPhredScaledQual() ); - return new ActivityProfileResult( isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); + return new ActivityProfileResult( ref.getLocus(), isActiveProb, averageHQSoftClips.mean() > 6.0 ? ActivityProfileResult.ActivityProfileResultState.HIGH_QUALITY_SOFT_CLIPS : ActivityProfileResult.ActivityProfileResultState.NONE, averageHQSoftClips.mean() ); } //--------------------------------------------------------------------------------------------------------------- @@ -419,52 +405,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); - for( final Pair>> callResult : - ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) - : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + for( final Pair>> callResult : + genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -520,6 +467,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a0924623b..29622ca17 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -148,45 +148,31 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { - // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); - } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping); + return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); } // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { - final int numHaplotypes = haplotypeMapping.size(); + final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; for( int iii = 0; iii < numHaplotypes; iii++ ) { Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { double haplotypeLikelihood = 0.0; for( final String sample : samples ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); @@ -200,12 +186,48 @@ public class LikelihoodCalculationEngine { } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } - } + } } } // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + // This function takes a set of samples to pool over and a haplotypeMapping + @Requires({"haplotypeList.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { + + final int numHaplotypes = haplotypeList.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Haplotype iii_haplotype = haplotypeList.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Haplotype jjj_haplotype = haplotypeList.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); + final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -296,14 +318,7 @@ public class LikelihoodCalculationEngine { final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - // set up the default 1-to-1 haplotype mapping object - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -347,7 +362,7 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call, + final Pair>> call, final double downsamplingFraction, final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index fd46e4e69..4f072d720 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype + // for GGA mode, add the desired allele into the haplotype + for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()); + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { return returnHaplotypes; //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); @@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { + + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). + if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); } } } @@ -369,6 +379,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + if ( haplotype.isArtificialHaplotype() ) + h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java new file mode 100755 index 000000000..be19d3ef4 --- /dev/null +++ b/protected/java/test/org/broadinstitute/sting/gatk/downsampling/AlleleBiasedDownsamplingUtilsUnitTest.java @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.downsampling; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + + +/** + * Basic unit test for AlleleBiasedDownsamplingUtils + */ +public class AlleleBiasedDownsamplingUtilsUnitTest extends BaseTest { + + + @Test + public void testSmartDownsampling() { + + final int[] idealHetAlleleCounts = new int[]{0, 50, 0, 50}; + final int[] idealHomAlleleCounts = new int[]{0, 100, 0, 0}; + + // no contamination, no removal + testOneCase(0, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 5, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // hom sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHomAlleleCounts, idealHomAlleleCounts); + + // het sample, het contaminant, different alleles + testOneCase(5, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, different alleles + testOneCase(10, 0, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 10, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // hom sample, het contaminant, overlapping alleles + final int[] enhancedHomAlleleCounts = new int[]{0, 105, 0, 0}; + testOneCase(5, 5, 0, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHomAlleleCounts, enhancedHomAlleleCounts); + + // hom sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHomAlleleCounts, new int[]{0, 110, 0, 0}); + + // het sample, het contaminant, overlapping alleles + testOneCase(5, 5, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 5, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 5, 0, 5, 0.1, 100, idealHetAlleleCounts, new int[]{0, 55, 0, 55}); + testOneCase(5, 0, 0, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 5, 5, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + + // het sample, hom contaminant, overlapping alleles + testOneCase(0, 10, 0, 0, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + testOneCase(0, 0, 0, 10, 0.1, 100, idealHetAlleleCounts, idealHetAlleleCounts); + } + + private static void testOneCase(final int addA, final int addC, final int addG, final int addT, final double contaminationFraction, + final int pileupSize, final int[] initialCounts, final int[] targetCounts) { + + final int[] actualCounts = initialCounts.clone(); + actualCounts[0] += addA; + actualCounts[1] += addC; + actualCounts[2] += addG; + actualCounts[3] += addT; + + final int[] results = AlleleBiasedDownsamplingUtils.runSmartDownsampling(actualCounts, (int)(pileupSize * contaminationFraction)); + Assert.assertTrue(countsAreEqual(results, targetCounts)); + } + + private static boolean countsAreEqual(final int[] counts1, final int[] counts2) { + for ( int i = 0; i < 4; i++ ) { + if ( counts1[i] != counts2[i] ) + return false; + } + return true; + } +} diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index b839382dc..de328c825 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "5eb3b94e767da19a4c037ee132e4b19a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")}, {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")}, @@ -90,6 +90,21 @@ public class BQSRIntegrationTest extends WalkerTest { executeTest("testBQSRFailWithoutDBSNP", spec); } + @Test + public void testBQSRCSV() { + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + " -T BaseRecalibrator" + + " -R " + b36KGReference + + " -I " + validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.bam" + + " -knownSites " + b36dbSNP129 + + " -L 1:10,000,000-10,200,000" + + " -o /dev/null" + + " --plot_pdf_file /dev/null" + + " --intermediate_csv_file %s", + Arrays.asList("d1c38a3418979400630e2bca1140689c")); + executeTest("testBQSR-CSVfile", spec); + } + @Test public void testBQSRFailWithSolidNoCall() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java similarity index 90% rename from public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java rename to protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 9212d0e53..d3e77e002 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("cdec335abc9ad8e59335e39a73e0e95a")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("efddb5e258f97fd4f6661cff9eaa57de")); + Arrays.asList("5b31b811072a4df04524e13604015f9b")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("24532eb381724cd74e99370da28d49ed")); + Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("062a946160eec1d0fc135d58ca654ff4")); + Arrays.asList("fea530fdc8677e10be4cc11625fa5376")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("94dc17d76d841f1d3a36160767ffa034")); + Arrays.asList("704888987baacff8c7b273b8ab9938d0")); executeTest("test Multiple SNP alleles", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("9106d01ca0d0a8fedd068e72d509f380")); + Arrays.asList("e14c9b1f9f34d6c16de445bfa385be89")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("d847acf841ba8ba653f996ce4869f439")); + Arrays.asList("fb204e821a24d03bd3a671b6e01c449a")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "6792419c482e767a3deb28913ed2b1ad"; + private final static String COMPRESSED_OUTPUT_MD5 = "5b8f477c287770b5769b05591e35bc2d"; @Test public void testCompressedOutput() { @@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("56157d930da6ccd224bce1ca93f11e41")); + Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); executeTest("test min_base_quality_score 26", spec); } @@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("6ccb9bd88934e4272d0ce362dd35e603")); + Arrays.asList("55760482335497086458b09e415ecf54")); executeTest("test SLOD", spec); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("480437dd6e2760f4ab3194431519f331")); + Arrays.asList("938e888a40182878be4c3cc4859adb69")); executeTest("test NDA", spec); } @@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("22c039412fd387dde6125b07c9a74a25")); + Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); executeTest("test using comp track", spec); } @@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "40aeb4c9e31fe7046b72afc58e7599cb"); + testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "c706ca93b25ff83613cb4e95dcac567c"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "8a263fd0a94463ce1de9990f2b8ec841"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "81fff490c0f59890f1e75dc290833434"); } private void testOutputParameters(final String args, final String md5) { @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("df524e98903d96ab9353bee7c16a69de")); + Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); executeTest("test confidence 1", spec1); } @@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "8e61498ca03a8d805372a64c466b3b42" ); + testHeterozosity( 0.01, "8dd37249e0a80afa86594c3f1e720760" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "668d06b5173cf3b97d052726988e1d7b" ); + testHeterozosity( 1.0 / 1850, "040d169e20fda56f8de009a6015eb384" ); } private void testHeterozosity(final double arg, final String md5) { @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("908eb5e21fa39e7fb377cf4a9c4c7835")); + Arrays.asList("0e4713e4aa44f4f8fcfea7138295a627")); executeTest(String.format("test multiple technologies"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("c814558bb0ed2e19b12e1a2bf4465d52")); + Arrays.asList("46ea5d1ceb8eed1d0db63c3577915d6c")); executeTest(String.format("test calling with BAQ"), spec); } @@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("3593495aab5f6204c65de0b073a6ff65")); + Arrays.asList("50329e15e5139be9e3b643f0b3ba8a53")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("8b486a098029d5a106b0a37eff541c15")); + Arrays.asList("2b85e3bd6bf981afaf7324666740d74b")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("18efedc50cae2aacaba372265e38310b")); + Arrays.asList("a6fd46eff78827060451a62cffd698a7")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("3ff8c7c80a518aa3eb8671a21479de5f")); + Arrays.asList("b8129bf754490cc3c76191d8cc4ec93f")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("578c0540f4f2052a634a829bcb9cc27d")); + Arrays.asList("591332fa0b5b22778cf820ee257049d2")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("f7d0d0aee603df25c1f0525bb8df189e")); + Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("fc91d457a16b4ca994959c2b5f3f0352")); + Arrays.asList("c526c234947482d1cd2ffc5102083a08")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("857b8e5df444463ac27f665c4f67fbe2")); + Arrays.asList("90adefd39ed67865b0cb275ad0f07383")); executeTest("test minIndelFraction 0.0", spec); } @@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("81d4c7d9010fd6733b2997bc378e7471")); + Arrays.asList("2fded43949e258f8e9f68893c61c1bdd")); executeTest("test minIndelFraction 0.25", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + validationDataLocation + "testWithNs.bam -o %s -L 8:141799600-141814700", 1, - Arrays.asList("bd7984a374f0ae5d277bd5fc5065f64f")); + Arrays.asList("d6d40bacd540a41f305420dfea35e04a")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("9a7cd58b9e3d5b72608c0d529321deba")); + Arrays.asList("c1077662411164182c5f75478344f83d")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "e7fc11baf208a1bca7b462d3148c936e"); + testReducedCalling("SNP", "f5ccbc96d0d66832dd9b3c5cb6507db4"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "132a4e0ccf9230b5bb4b56c649e2bdd5"); + testReducedCalling("INDEL", "3c02ee5187933bed44dc416a2e28511f"); } @@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testContaminationDownsampling() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, - Arrays.asList("27dd04159e06d9524fb8a4eef41f96ae")); + Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); executeTest("test contamination_percentage_to_filter 0.20", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 86f3748ce..f8ba1f4cc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,17 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "aa1df35d6e64d7ca93feb4d2dd15dd0e"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "186c7f322978283c01249c6de2829215"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } + // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "de9e78a52207fe62144dba5337965469"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "000dbb1b48f94d017cfec127c6cabe8f"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "16013a9203367c3d1c4ce1dcdc81ef4a"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "b369c2a6cb5c99a424551b33bae16f3b"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { - final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("c306140ad28515ee06c603c225217939")); + final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("b6c67ee8e99cc8f53a6587bb26028047")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -91,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("4beb9f87ab3f316a9384c3d0dca6ebe9")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index e82946690..19ced9f42 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { haplotypes.add(haplotype); } } - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample"); + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 0daad2c2b..4f9031329 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -118,17 +118,24 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error final String message = t.getMessage(); if ( message == null ) return; - // we know what to do about the common "Too many open files" error + // too many open files error if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || - message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing @@ -138,12 +145,10 @@ public class CommandLineGATK extends CommandLineExecutable { // disk is full if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - // masked out of memory error - if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java index af330bba9..34627b973 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java +++ b/public/java/src/org/broadinstitute/sting/gatk/contexts/ReferenceContext.java @@ -27,7 +27,6 @@ package org.broadinstitute.sting.gatk.contexts; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; -import net.sf.samtools.util.StringUtil; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -39,10 +38,7 @@ import org.broadinstitute.sting.utils.GenomeLocParser; * @author hanna * @version 0.1 */ - public class ReferenceContext { - final public static boolean UPPERCASE_REFERENCE = true; - /** * Facilitates creation of new GenomeLocs. */ @@ -59,7 +55,8 @@ public class ReferenceContext { final private GenomeLoc window; /** - * The bases in the window around the current locus. If null, then bases haven't been fetched yet + * The bases in the window around the current locus. If null, then bases haven't been fetched yet. + * Bases are always upper cased */ private byte[] basesCache = null; @@ -81,7 +78,7 @@ public class ReferenceContext { * * @return */ - @Ensures("result != null") + @Ensures({"result != null"}) public byte[] getBases(); } @@ -146,7 +143,9 @@ public class ReferenceContext { private void fetchBasesFromProvider() { if ( basesCache == null ) { basesCache = basesProvider.getBases(); - if (UPPERCASE_REFERENCE) StringUtil.toUpperCase(basesCache); + + // must be an assertion that only runs when the bases are fetch to run in a reasonable amount of time + assert BaseUtils.isUpperCase(basesCache); } } @@ -194,6 +193,7 @@ public class ReferenceContext { /** * All the bases in the window from the current base forward to the end of the window. */ + @Ensures({"result != null", "result.length > 0"}) public byte[] getForwardBases() { final byte[] bases = getBases(); final int mid = locus.getStart() - window.getStart(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java index 31f6d5954..8e4633869 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/io/storage/VariantContextWriterStorage.java @@ -198,8 +198,6 @@ public class VariantContextWriterStorage implements Storage extends TraversalEngine activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - GenomeLoc location = locus.getLocation(); + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); - if(prevLoc != null) { - // fill in the active / inactive labels from the stop of the previous location to the start of this location - // TODO refactor to separate function - for(int iii = prevLoc.getStop() + 1; iii < location.getStart(); iii++ ) { - final GenomeLoc fakeLoc = engine.getGenomeLocParser().createGenomeLoc(prevLoc.getContig(), iii, iii); - if( initialIntervals == null || initialIntervals.overlaps( fakeLoc ) ) { - profile.add(fakeLoc, new ActivityProfileResult( walker.hasPresetActiveRegions() && walker.presetActiveRegions.overlaps(fakeLoc) ? 1.0 : 0.0 )); - } - } + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); } - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - if( initialIntervals == null || initialIntervals.overlaps( location ) ) { - profile.add(location, walkerActiveProb(walker, tracker, refContext, locus, location)); - } - - // Grab all the previously unseen reads from this pileup and add them to the massive read list - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); - } - - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); - } - - prevLoc = location; - - printProgress(locus.getLocation()); + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); } - updateCumulativeMetrics(dataProvider.getShard()); + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; - // Take the individual isActive calls and integrate them into contiguous active regions and - // add these blocks of work to the work queue - // band-pass filter the list of isActive probabilities and turn into active regions - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - final List activeRegions = bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize ); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); - } - } - workQueue.addAll( activeRegions ); + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + dataProvider.getShard().getReadMetrics().incrementNumIterations(); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + return sum; } + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + // -------------------------------------------------------------------------------- // @@ -150,7 +176,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine walker, final LocusShardDataProvider dataProvider ) { - final DataSource dataSource = WalkerManager.getWalkerDataSource(walker); - if( dataSource == DataSource.READS ) - return new CoveredLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE ) //|| ! GenomeAnalysisEngine.instance.getArguments().enableRodWalkers ) - return new AllLocusView(dataProvider); - else if( dataSource == DataSource.REFERENCE_ORDERED_DATA ) - return new RodLocusView(dataProvider); - else - throw new UnsupportedOperationException("Unsupported traversal type: " + dataSource); - } - /** * Special function called in LinearMicroScheduler to empty out the work queue. * Ugly for now but will be cleaned up when we push this functionality more into the engine diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index c4de9ed45..92060b4a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -82,7 +82,7 @@ import java.util.*; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) -public class VariantAnnotator extends RodWalker implements AnnotatorCompatible { +public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker implements Ann return true; } - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** * We want reads that span deletions * @@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker implements Ann return 1; } - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java index ee4f77752..725097ddc 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorEngine.java @@ -277,8 +277,12 @@ public class VariantAnnotatorEngine { if ( expression.fieldName.equals("ID") ) { if ( vc.hasID() ) infoAnnotations.put(expression.fullName, vc.getID()); + } else if (expression.fieldName.equals("ALT")) { + infoAnnotations.put(expression.fullName, vc.getAlternateAllele(0).getDisplayString()); + } else if ( vc.hasAttribute(expression.fieldName) ) { - infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + infoAnnotations.put(expression.fullName, vc.getAttribute(expression.fieldName)); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 9506510a9..7ce98cf1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche */ public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it RecalUtils.parsePlatformForRead(read, RAC); @@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { - final int BUFFER_SIZE = 0; final int readLength = read.getReadBases().length; final boolean[] knownSites = new boolean[readLength]; Arrays.fill(knownSites, false); for( final Feature f : features ) { int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? - if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); - if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } - Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); } return knownSites; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index fc7d8a8a4..c64482151 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -75,8 +75,9 @@ public class RecalibrationArgumentCollection { /** * If not provided, then a temporary file is created and then deleted upon completion. + * For advanced users only. */ - @Hidden + @Advanced @Argument(fullName = "intermediate_csv_file", shortName = "intermediate", doc = "The intermediate csv file to create", required = false) public File RECAL_CSV_FILE = null; @@ -101,13 +102,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// /** * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. */ - @Hidden + @Advanced @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") public boolean RUN_WITHOUT_DBSNP = false; @@ -138,6 +136,13 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false) public int INDELS_CONTEXT_SIZE = 3; + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) */ @@ -175,9 +180,15 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java index 58ddd0879..48019efea 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLoci.java @@ -191,7 +191,7 @@ public class CallableLoci extends LocusWalker minMapQ and base quality > minBaseQ in the context * as an array of ints, indexed by the index fields of BaseUtils @@ -64,10 +78,10 @@ public class CoverageUtils { } public static Map> - getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection types) { + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { Map> countsByIDByType = new HashMap>(); - Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); for (DoCOutputType.Partition t : types ) { // iterate through the read group counts and build the type associations for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { @@ -95,31 +109,95 @@ public class CoverageUtils { } } - public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { Map countsByRG = new HashMap(); - for ( PileupElement e : context.getBasePileup() ) { - if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) { - SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if ( ! countsByRG.keySet().contains(readGroup) ) { - countsByRG.put(readGroup,new int[6]); - updateCounts(countsByRG.get(readGroup),e); - } else { - updateCounts(countsByRG.get(readGroup),e); + + List countPileup = new LinkedList(); + FragmentCollection fpile; + + switch (countType) { + + case COUNT_READS: + for (PileupElement e : context.getBasePileup()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + break; + + case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { + countPileup.add(e); // add the first passing element per fragment + break; + } + } } - } + break; + + case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + PileupElement firstElem = null; + PileupElement addElem = null; + + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (firstElem == null) + firstElem = e; + else if (e.getBase() != firstElem.getBase()) { + addElem = null; + break; + } + + // will add the first passing element per base-consistent fragment: + if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + addElem = e; + } + + if (addElem != null) + countPileup.add(addElem); + } + break; + + default: + throw new UserException("Must use valid CountPileupType"); + } + + for (PileupElement e : countPileup) { + SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); + if (!countsByRG.keySet().contains(readGroup)) + countsByRG.put(readGroup, new int[6]); + + updateCounts(countsByRG.get(readGroup), e); } return countsByRG; } + private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); + } + private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX]++; + counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX]++; + counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); } else { try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); } catch (ArrayIndexOutOfBoundsException exc) { throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 44b0d74ca..fe9942662 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker { int depth = ThresHolder.DEFAULTS.getFilteredCoverage(context.getBasePileup()); // note the linear probability scale - return new ActivityProfileResult(Math.min(depth / coverageThreshold, 1)); + return new ActivityProfileResult(ref.getLocus(), Math.min(depth / coverageThreshold, 1)); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 2b9744b89..22c6097cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -47,6 +47,12 @@ import java.util.List; *

* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 97254c478..80bc04845 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -190,6 +190,10 @@ public class UnifiedGenotyperEngine { final VariantContext vc = calculateLikelihoods(tracker, refContext, stratifiedContexts, AlignmentContextUtils.ReadOrientation.COMPLETE, null, true, model, perReadAlleleLikelihoodMap); if ( vc != null ) results.add(calculateGenotypes(tracker, refContext, rawContext, stratifiedContexts, vc, model, true, perReadAlleleLikelihoodMap)); +// todo - uncomment if we want to also emit a null ref call (with no QUAL) if there's no evidence for REF and if EMIT_ALL_SITES is set +// else if (UAC.OutputMode == OUTPUT_MODE.EMIT_ALL_SITES) +// results.add(generateEmptyContext(tracker, refContext, null, rawContext)); + } } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 998894fbf..345f79b2b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.gatk.walkers.indels; -import net.sf.picard.reference.IndexedFastaSequenceFile; +import com.google.java.contract.Requires; import net.sf.samtools.*; import net.sf.samtools.util.RuntimeIOException; import net.sf.samtools.util.SequenceUtil; @@ -236,6 +236,8 @@ public class IndelRealigner extends ReadWalker { * then extensions (".bam" or ".sam") will be stripped from the input file names and the provided string value will be pasted on instead; 2) if the * value ends with a '.map' (e.g. input_output.map), then the two-column tab-separated file with the specified name must exist and list unique output * file name (2nd column) for each input file name (1st column). + * + * Note that some GATK arguments do NOT work in conjunction with nWayOut (e.g. --disable_bam_indexing). */ @Argument(fullName="nWayOut", shortName="nWayOut", required=false, doc="Generate one output file for each input (-I) bam file") protected String N_WAY_OUT = null; @@ -274,7 +276,7 @@ public class IndelRealigner extends ReadWalker { protected String OUT_SNPS = null; // fasta reference reader to supplement the edges of the reference sequence - private IndexedFastaSequenceFile referenceReader; + private CachingIndexedFastaSequenceFile referenceReader; // the intervals input by the user private Iterator intervals = null; @@ -1601,7 +1603,8 @@ public class IndelRealigner extends ReadWalker { public List getReads() { return reads; } - public byte[] getReference(IndexedFastaSequenceFile referenceReader) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getReference(CachingIndexedFastaSequenceFile referenceReader) { // set up the reference if we haven't done so yet if ( reference == null ) { // first, pad the reference to handle deletions in narrow windows (e.g. those with only 1 read) @@ -1609,7 +1612,6 @@ public class IndelRealigner extends ReadWalker { int padRight = Math.min(loc.getStop()+REFERENCE_PADDING, referenceReader.getSequenceDictionary().getSequence(loc.getContig()).getSequenceLength()); loc = getToolkit().getGenomeLocParser().createGenomeLoc(loc.getContig(), padLeft, padRight); reference = referenceReader.getSubsequenceAt(loc.getContig(), loc.getStart(), loc.getStop()).getBases(); - StringUtil.toUpperCase(reference); } return reference; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java index a73e125ad..201028d99 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEval.java @@ -183,6 +183,10 @@ public class VariantEval extends RodWalker implements TreeRedu @Argument(fullName="keepAC0", shortName="keepAC0", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) private boolean keepSitesWithAC0 = false; + @Hidden + @Argument(fullName="numSamples", shortName="numSamples", doc="If provided, modules that track polymorphic sites will not require that a site have AC > 0 when the input eval has genotypes", required=false) + private int numSamplesFromArgument = 0; + /** * If true, VariantEval will treat -eval 1 -eval 2 as separate tracks from the same underlying * variant set, and evaluate the union of the results. Useful when you want to do -eval chr1.vcf -eval chr2.vcf etc. @@ -589,6 +593,14 @@ public class VariantEval extends RodWalker implements TreeRedu public boolean isSubsettingToSpecificSamples() { return isSubsettingSamples; } public Set getSampleNamesForEvaluation() { return sampleNamesForEvaluation; } + public int getNumberOfSamplesForEvaluation() { + if (sampleNamesForEvaluation!= null && !sampleNamesForEvaluation.isEmpty()) + return sampleNamesForEvaluation.size(); + else { + return numSamplesFromArgument; + } + + } public Set getSampleNamesForStratification() { return sampleNamesForStratification; } public List> getComps() { return comps; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java deleted file mode 100755 index 347ca56b8..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/evaluators/VariantQualityScore.java +++ /dev/null @@ -1,249 +0,0 @@ -/* - * Copyright (c) 2010 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR - * THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.walkers.varianteval.evaluators; - -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.Analysis; -import org.broadinstitute.sting.gatk.walkers.varianteval.util.DataPoint; -import org.broadinstitute.sting.utils.collections.Pair; -import org.broadinstitute.sting.utils.variantcontext.Allele; -import org.broadinstitute.sting.utils.variantcontext.VariantContext; -import org.broadinstitute.sting.utils.variantcontext.VariantContextUtils; - -import java.util.ArrayList; -import java.util.HashMap; - -/** - * @author rpoplin - * @since Apr 6, 2010 - */ - -//@Analysis(name = "Variant Quality Score", description = "Shows various stats of sets of variants binned by variant quality score") -@Deprecated -public class VariantQualityScore { - // TODO - this should really be a stratification - -// public class VariantQualityScore extends VariantEvaluator { -// -// // a mapping from quality score histogram bin to Ti/Tv ratio -// @DataPoint(description = "the Ti/Tv ratio broken out by variant quality") -// TiTvStats titvStats = null; -// -// @DataPoint(description = "average variant quality for each allele count") -// AlleleCountStats alleleCountStats = null; -// -// static class TiTvStats extends TableType { -// final static int NUM_BINS = 20; -// final HashMap> qualByIsTransition = new HashMap>(); // A hashMap holds all the qualities until we are able to bin them appropriately -// final long transitionByQuality[] = new long[NUM_BINS]; -// final long transversionByQuality[] = new long[NUM_BINS]; -// final double titvByQuality[] = new double[NUM_BINS]; // the final ti/tv sets that get reported out -// -// public Object[] getRowKeys() { -// return new String[]{"sample"}; -// } -// -// public Object[] getColumnKeys() { -// final String columnKeys[] = new String[NUM_BINS]; -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// columnKeys[iii] = "titvBin" + iii; -// } -// return columnKeys; -// } -// -// public String getCell(int x, int y) { -// return String.valueOf(titvByQuality[y]); -// } -// -// public String toString() { -// StringBuffer returnString = new StringBuffer(); -// // output the ti/tv array -// returnString.append("titvByQuality: "); -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// returnString.append(titvByQuality[iii]); -// returnString.append(" "); -// } -// return returnString.toString(); -// } -// -// public void incrValue( final double qual, final boolean isTransition ) { -// final Integer qualKey = Math.round((float) qual); -// final long numTransition = (isTransition ? 1L : 0L); -// final long numTransversion = (isTransition ? 0L : 1L); -// if( qualByIsTransition.containsKey(qualKey) ) { -// Pair transitionPair = qualByIsTransition.get(qualKey); -// transitionPair.set(transitionPair.getFirst() + numTransition, transitionPair.getSecond() + numTransversion); -// qualByIsTransition.put(qualKey, transitionPair); -// } else { -// qualByIsTransition.put(qualKey, new Pair(numTransition,numTransversion)); -// } -// } -// -// public void organizeTiTvTables() { -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// transitionByQuality[iii] = 0L; -// transversionByQuality[iii] = 0L; -// titvByQuality[iii] = 0.0; -// } -// -// int maxQual = 0; -// -// // Calculate the maximum quality score in order to normalize and histogram -// for( final Integer qual : qualByIsTransition.keySet() ) { -// if( qual > maxQual ) { -// maxQual = qual; -// } -// } -// -// final double binSize = ((double)maxQual) / ((double) (NUM_BINS-1)); -// -// for( final Integer qual : qualByIsTransition.keySet() ) { -// final int index = (int)Math.floor( ((double) qual) / binSize ); -// if( index >= 0 ) { // BUGBUG: why is there overflow here? -// Pair transitionPair = qualByIsTransition.get(qual); -// transitionByQuality[index] += transitionPair.getFirst(); -// transversionByQuality[index] += transitionPair.getSecond(); -// } -// } -// -// for( int iii = 0; iii < NUM_BINS; iii++ ) { -// if( transitionByQuality[iii] + transversionByQuality[iii] > 800L ) { // need to have a sufficient number of variants to get a useful Ti/Tv ratio -// titvByQuality[iii] = ((double) transitionByQuality[iii]) / ((double) transversionByQuality[iii]); -// } else { -// titvByQuality[iii] = 0.0; -// } -// } -// -// } -// } -// -// class AlleleCountStats extends TableType { -// final HashMap> qualityListMap = new HashMap>(); -// final HashMap qualityMap = new HashMap(); -// -// public Object[] getRowKeys() { -// final int NUM_BINS = qualityListMap.keySet().size(); -// final String rowKeys[] = new String[NUM_BINS]; -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// rowKeys[iii] = "AC" + key; -// iii++; -// } -// return rowKeys; -// -// } -// -// public Object[] getColumnKeys() { -// return new String[]{"alleleCount","avgQual"}; -// } -// -// public String getCell(int x, int y) { -// int iii = 0; -// for( final Integer key : qualityListMap.keySet() ) { -// if(iii == x) { -// if(y == 0) { return String.valueOf(key); } -// else { return String.valueOf(qualityMap.get(key)); } -// } -// iii++; -// } -// return null; -// } -// -// public String toString() { -// String returnString = ""; -// // output the quality map -// returnString += "AlleleCountStats: "; -// //for( int iii = 0; iii < NUM_BINS; iii++ ) { -// // returnString += titvByQuality[iii] + " "; -// //} -// return returnString; -// } -// -// public void incrValue( final double qual, final int alleleCount ) { -// ArrayList list = qualityListMap.get(alleleCount); -// if(list==null) { list = new ArrayList(); } -// list.add(qual); -// qualityListMap.put(alleleCount, list); -// } -// -// public void organizeAlleleCountTables() { -// for( final Integer key : qualityListMap.keySet() ) { -// final ArrayList list = qualityListMap.get(key); -// double meanQual = 0.0; -// final double numQuals = (double)list.size(); -// for( Double qual : list ) { -// meanQual += qual / numQuals; -// } -// qualityMap.put(key, meanQual); -// } -// } -// } -// -// //public VariantQualityScore(VariantEvalWalker parent) { -// //super(parent); -// //} -// -// public String getName() { -// return "VariantQualityScore"; -// } -// -// public int getComparisonOrder() { -// return 1; // we only need to see each eval track -// } -// -// public String toString() { -// return getName(); -// } -// -// public String update1(VariantContext eval, RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { -// final String interesting = null; -// -// if( eval != null && eval.isSNP() && eval.isBiallelic() && eval.isPolymorphicInSamples() ) { //BUGBUG: only counting biallelic sites (revisit what to do with triallelic sites) -// if( titvStats == null ) { titvStats = new TiTvStats(); } -// titvStats.incrValue(eval.getPhredScaledQual(), VariantContextUtils.isTransition(eval)); -// -// if( alleleCountStats == null ) { alleleCountStats = new AlleleCountStats(); } -// int alternateAlleleCount = 0; -// for (final Allele a : eval.getAlternateAlleles()) { -// alternateAlleleCount += eval.getCalledChrCount(a); -// } -// alleleCountStats.incrValue(eval.getPhredScaledQual(), alternateAlleleCount); -// } -// -// return interesting; // This module doesn't capture any interesting sites, so return null -// } -// -// public void finalizeEvaluation() { -// if( titvStats != null ) { -// titvStats.organizeTiTvTables(); -// } -// if( alleleCountStats != null ) { -// alleleCountStats.organizeAlleleCountTables(); -// } -// } -} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java index e6efd4482..7197fc14c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/stratifications/AlleleCount.java @@ -29,7 +29,7 @@ public class AlleleCount extends VariantStratifier { // There are ploidy x n sample chromosomes // TODO -- generalize to handle multiple ploidy - nchrom = getVariantEvalWalker().getSampleNamesForEvaluation().size() * getVariantEvalWalker().getSamplePloidy(); + nchrom = getVariantEvalWalker().getNumberOfSamplesForEvaluation() * getVariantEvalWalker().getSamplePloidy(); if ( nchrom < 2 ) throw new UserException.BadArgumentValue("AlleleCount", "AlleleCount stratification requires an eval vcf with at least one sample"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java index d1b7cb96f..9253446c8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/SelectVariants.java @@ -151,14 +151,6 @@ import java.util.*; * -mvq 50 \ * -o violations.vcf * - * Creating a sample of exactly 1000 variants randomly chosen with equal probability from the variant VCF: - * java -Xmx2g -jar GenomeAnalysisTK.jar \ - * -R ref.fasta \ - * -T SelectVariants \ - * --variant input.vcf \ - * -o output.vcf \ - * -number 1000 - * * Creating a set with 50% of the total number of variants in the variant VCF: * java -Xmx2g -jar GenomeAnalysisTK.jar \ * -R ref.fasta \ @@ -659,7 +651,10 @@ public class SelectVariants extends RodWalker implements TreeR return (g !=null && !g.isHomRef() && (g.isCalled() || (g.isFiltered() && !EXCLUDE_FILTERED))); } - private boolean haveSameGenotypes(Genotype g1, Genotype g2) { + private boolean haveSameGenotypes(final Genotype g1, final Genotype g2) { + if ( g1 == null || g2 == null ) + return false; + if ((g1.isCalled() && g2.isFiltered()) || (g2.isCalled() && g1.isFiltered()) || (g1.isFiltered() && g2.isFiltered() && EXCLUDE_FILTERED)) diff --git a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 69920ece4..53a49d8b2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -24,33 +24,6 @@ public class BaseUtils { public final static byte[] BASES = {'A', 'C', 'G', 'T'}; public final static byte[] EXTENDED_BASES = {'A', 'C', 'G', 'T', 'N', 'D'}; - public enum Base { - A('A', 0), - C('C', 1), - G('G', 2), - T('T', 3); - - byte b; - int index; - - private Base(char base, int index) { - this.b = (byte) base; - this.index = index; - } - - public byte getBase() { return b; } - - public char getBaseAsChar() { return (char) b; } - - public int getIndex() { return index; } - - public boolean sameBase(byte o) { return b == o; } - - public boolean sameBase(char o) { return b == (byte) o; } - - public boolean sameBase(int i) { return index == i; } - } - static private final int[] baseIndexMap = new int[256]; static { Arrays.fill(baseIndexMap, -1); @@ -130,6 +103,17 @@ public class BaseUtils { return false; } + public static boolean isUpperCase(final byte[] bases) { + for ( byte base : bases ) + if ( ! isUpperCase(base) ) + return false; + return true; + } + + public static boolean isUpperCase(final byte base) { + return base >= 'A' && base <= 'Z'; + } + /** * Converts a IUPAC nucleotide code to a pair of bases * @@ -271,59 +255,6 @@ public class BaseUtils { } } - /** - * Converts a base index to a base index representing its cross-talk partner - * - * @param baseIndex 0, 1, 2, 3 - * @return 1, 0, 3, 2, or -1 if the index can't be understood - */ - static public int crossTalkPartnerIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 1; // A -> C - case 1: - return 0; // C -> A - case 2: - return 3; // G -> T - case 3: - return 2; // T -> G - default: - return -1; - } - } - - /** - * Converts a base to the base representing its cross-talk partner - * - * @param base [AaCcGgTt] - * @return C, A, T, G, or '.' if the base can't be understood - */ - @Deprecated - static public char crossTalkPartnerBase(char base) { - return (char) baseIndexToSimpleBase(crossTalkPartnerIndex(simpleBaseToBaseIndex(base))); - } - - /** - * Return the complement of a base index. - * - * @param baseIndex the base index (0:A, 1:C, 2:G, 3:T) - * @return the complementary base index - */ - static public byte complementIndex(int baseIndex) { - switch (baseIndex) { - case 0: - return 3; // a -> t - case 1: - return 2; // c -> g - case 2: - return 1; // g -> c - case 3: - return 0; // t -> a - default: - return -1; // wtf? - } - } - /** * Return the complement (A <-> T or C <-> G) of a base, or the specified base if it can't be complemented (i.e. an ambiguous base). * @@ -350,7 +281,7 @@ public class BaseUtils { } @Deprecated - static public char simpleComplement(char base) { + static private char simpleComplement(char base) { return (char) simpleComplement((byte) base); } @@ -370,22 +301,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a byte array of bases (that is, chars casted to bytes, *not* base indices in byte form) - * - * @param bases the byte array of bases - * @return the complement of the base byte array - */ - static public byte[] simpleComplement(byte[] bases) { - byte[] rcbases = new byte[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a char array of bases * @@ -403,23 +318,6 @@ public class BaseUtils { return rcbases; } - /** - * Complement a char array of bases - * - * @param bases the char array of bases - * @return the complement of the base char array - */ - @Deprecated - static public char[] simpleComplement(char[] bases) { - char[] rcbases = new char[bases.length]; - - for (int i = 0; i < bases.length; i++) { - rcbases[i] = simpleComplement(bases[i]); - } - - return rcbases; - } - /** * Reverse complement a String of bases. Preserves ambiguous bases. * @@ -431,17 +329,6 @@ public class BaseUtils { return new String(simpleReverseComplement(bases.getBytes())); } - /** - * Complement a String of bases. Preserves ambiguous bases. - * - * @param bases the String of bases - * @return the complement of the String - */ - @Deprecated - static public String simpleComplement(String bases) { - return new String(simpleComplement(bases.getBytes())); - } - /** * Returns the uppercased version of the bases * @@ -543,82 +430,4 @@ public class BaseUtils { return randomBaseIndex; } - - /** - * Return a random base (A, C, G, T). - * - * @return a random base (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase() { - return getRandomBase('.'); - } - - /** - * Return a random base, excluding some base. - * - * @param excludeBase the base to exclude - * @return a random base, excluding the one specified (A, C, G, T) - */ - @Deprecated - static public byte getRandomBase(char excludeBase) { - return BaseUtils.baseIndexToSimpleBase(getRandomBaseIndex(BaseUtils.simpleBaseToBaseIndex(excludeBase))); - } - - /** - * Computes the smallest period >= minPeriod for the specified string. The period is defined as such p, - * that for all i = 0... seq.length-1, seq[ i % p ] = seq[i] (or equivalently seq[i] = seq[i+p] for i=0...seq.length-1-p). - * The sequence does not have to contain whole number of periods. For instance, "ACACACAC" has a period - * of 2 (it has a period of 4 as well), and so does - * "ACACA"; similarly, smallest periods of "CTCCTC", "CTCCT", and "CTCC" are all equal to 3. The "trivial" period is - * the length of the string itself, and it will always be returned if no smaller period can be found in the specified period range - * or if specified minPeriod is greater than the sequence length. - * - * @param seq - * @return - */ - public static int sequencePeriod(byte[] seq, int minPeriod) { - int period = (minPeriod > seq.length ? seq.length : minPeriod); - // we assume that bases [0,period-1] repeat themselves and check this assumption - // until we find correct period - - for (int pos = period; pos < seq.length; pos++) { - - int offset = pos % period; // we are currenlty 'offset' bases into the putative repeat of period 'period' - // if our current hypothesis holds, base[pos] must be the same as base[offset] - - if (Character.toUpperCase(seq[pos]) != Character.toUpperCase(seq[offset])) { - - // period we have been trying so far does not work. - // two possibilities: - // A) offset = 0, i.e. current position pos must be start of the next repeat, but it is not; - // in this case only bases from start up to the current one, inclusive, may form a repeat, if at all; - // so period is at least pos+1 (remember, pos is 0-based), then on the next loop re-entrance - // pos will be autoincremented and we will be checking next base - // B) offset != 0, i.e. the current base breaks the repeat, but maybe it starts a new one? - // hence we should first check if it matches the first base of the sequence, and to do that - // we set period to pos (thus trying the hypothesis that bases from start up to the current one, - // non-inclusive are repeated hereafter), and decrement pos (this will re-test current base against the first base - // on the next loop re-entrance after pos is autoincremented) - if (offset == 0) - period = pos + 1; - else - period = pos--; - - } - } - return period; - } } - -/* code snippet for testing sequencePeriod(): - * - * String str = "CCTTG"; - int p = 0; - System.out.print("Periods of " + str +" are:"); - while ( p < str.length() ) { - p = sequencePeriod(str, p+1); - System.out.print(" "+p); - } - System.out.println(); System.exit(1); -*/ diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index d11adf9e3..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex >= 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,7 +49,9 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; - + private Allele artificialAllele = null; + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -71,6 +73,12 @@ public class Haplotype { this(bases, 0); } + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,8 +179,25 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -182,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..e4d6f6233 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); @@ -701,11 +705,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index decc54d47..0d12d53cc 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -1,11 +1,11 @@ package org.broadinstitute.sting.utils.activeregion; -import net.sf.picard.reference.IndexedFastaSequenceFile; -import net.sf.samtools.util.StringUtil; +import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.clipping.ReadClipper; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.ArrayList; @@ -54,27 +54,31 @@ public class ActiveRegion implements HasGenomeLocation { public ArrayList getReads() { return reads; } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getActiveRegionReference(referenceReader, 0); } - public byte[] getActiveRegionReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getActiveRegionReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, extendedLoc ); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader ) { return getFullReference(referenceReader, 0); } - public byte[] getFullReference( final IndexedFastaSequenceFile referenceReader, final int padding ) { + @Requires("referenceReader.isUppercasingBases()") + public byte[] getFullReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding ) { return getReference( referenceReader, padding, fullExtentReferenceLoc ); } - private byte[] getReference( final IndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { + @Requires("referenceReader.isUppercasingBases()") + private byte[] getReference( final CachingIndexedFastaSequenceFile referenceReader, final int padding, final GenomeLoc genomeLoc ) { final byte[] reference = referenceReader.getSubsequenceAt( genomeLoc.getContig(), Math.max(1, genomeLoc.getStart() - padding), Math.min(referenceReader.getSequenceDictionary().getSequence(genomeLoc.getContig()).getSequenceLength(), genomeLoc.getStop() + padding) ).getBases(); - StringUtil.toUpperCase(reference); return reference; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 73f3cc487..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -24,11 +24,11 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Requires; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.util.ArrayList; import java.util.Collections; @@ -45,6 +45,7 @@ public class ActivityProfile { final GenomeLocParser parser; final boolean presetRegions; GenomeLoc regionStartLoc = null; + GenomeLoc regionStopLoc = null; final List isActiveList; private static final int FILTER_SIZE = 80; private static final double[] GaussianKernel; @@ -71,19 +72,49 @@ public class ActivityProfile { this.regionStartLoc = regionStartLoc; } - public void add(final GenomeLoc loc, final ActivityProfileResult result) { - if ( loc.size() != 1 ) - throw new ReviewedStingException("Bad add call to ActivityProfile: loc " + loc + " size != 1" ); - isActiveList.add(result); - if( regionStartLoc == null ) { + @Override + public String toString() { + return "ActivityProfile{" + + "start=" + regionStartLoc + + ", stop=" + regionStopLoc + + '}'; + } + + /** + * Add the next ActivityProfileResult to this profile. + * + * Must be contiguous with the previously added result, or an IllegalArgumentException will be thrown + * + * @param result a well-formed ActivityProfileResult result to incorporate into this profile + */ + @Requires("result != null") + public void add(final ActivityProfileResult result) { + final GenomeLoc loc = result.getLoc(); + + if ( regionStartLoc == null ) { regionStartLoc = loc; + regionStopLoc = loc; + } else { + if ( regionStopLoc.getStart() != loc.getStart() - 1 ) + throw new IllegalArgumentException("Bad add call to ActivityProfile: loc " + loc + " not immediate after last loc " + regionStopLoc ); + regionStopLoc = loc; } + + isActiveList.add(result); } public int size() { return isActiveList.size(); } + public boolean isEmpty() { + return isActiveList.isEmpty(); + } + + public boolean hasPresetRegions() { + return presetRegions; + } + /** * Band pass this ActivityProfile, producing a new profile that's band pass filtered * @return a new ActivityProfile that's the band-pass filtered version of this profile @@ -104,14 +135,21 @@ public class ActivityProfile { } iii++; } - final double[] filteredProbArray = new double[activeProbArray.length]; + + final double[] filteredProbArray; if( !presetRegions ) { + // if we aren't using preset regions, actually apply the band pass filter for activeProbArray into filteredProbArray + filteredProbArray = new double[activeProbArray.length]; for( iii = 0; iii < activeProbArray.length; iii++ ) { final double[] kernel = ArrayUtils.subarray(GaussianKernel, Math.max(FILTER_SIZE-iii, 0), Math.min(GaussianKernel.length,FILTER_SIZE + activeProbArray.length - iii)); final double[] activeProbSubArray = ArrayUtils.subarray(activeProbArray, Math.max(0,iii - FILTER_SIZE), Math.min(activeProbArray.length,iii + FILTER_SIZE + 1)); filteredProbArray[iii] = MathUtils.dotProduct(activeProbSubArray, kernel); } + } else { + // otherwise we simply use the activeProbArray directly + filteredProbArray = activeProbArray; } + iii = 0; for( final double prob : filteredProbArray ) { final ActivityProfileResult result = isActiveList.get(iii++); @@ -119,6 +157,7 @@ public class ActivityProfile { result.resultState = ActivityProfileResult.ActivityProfileResultState.NONE; result.resultValue = null; } + return new ActivityProfile(parser, presetRegions, isActiveList, regionStartLoc); } @@ -166,6 +205,7 @@ public class ActivityProfile { private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize) { return createActiveRegion(isActive, curStart, curEnd, activeRegionExtension, maxRegionSize, new ArrayList()); } + private final List createActiveRegion(final boolean isActive, final int curStart, final int curEnd, final int activeRegionExtension, final int maxRegionSize, final List returnList) { if( !isActive || curEnd - curStart < maxRegionSize ) { final GenomeLoc loc = parser.createGenomeLoc(regionStartLoc.getContig(), regionStartLoc.getStart() + curStart, regionStartLoc.getStart() + curEnd); diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java index 8dc29aa3c..273c2e785 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfileResult.java @@ -1,12 +1,16 @@ package org.broadinstitute.sting.utils.activeregion; +import com.google.java.contract.Ensures; +import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.GenomeLoc; + /** * Created with IntelliJ IDEA. * User: rpoplin * Date: 7/27/12 */ - public class ActivityProfileResult { + private GenomeLoc loc; public double isActiveProb; public ActivityProfileResultState resultState; public Number resultValue; @@ -16,16 +20,52 @@ public class ActivityProfileResult { HIGH_QUALITY_SOFT_CLIPS } - public ActivityProfileResult( final double isActiveProb ) { - this.isActiveProb = isActiveProb; - this.resultState = ActivityProfileResultState.NONE; - this.resultValue = null; + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb ) { + this(loc, isActiveProb, ActivityProfileResultState.NONE, null); } - public ActivityProfileResult( final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + /** + * Create a new ActivityProfileResult at loc with probability of being active of isActiveProb that maintains some + * information about the result state and value (TODO RYAN -- what do these mean?) + * + * @param loc the position of the result profile (for debugging purposes) + * @param isActiveProb the probability of being active (between 0 and 1) + */ + @Requires({"loc != null", "isActiveProb >= 0.0 && isActiveProb <= 1.0"}) + public ActivityProfileResult( final GenomeLoc loc, final double isActiveProb, final ActivityProfileResultState resultState, final Number resultValue ) { + // make sure the location of that activity profile is 1 + if ( loc.size() != 1 ) + throw new IllegalArgumentException("Location for an ActivityProfileResult must have to size 1 bp but saw " + loc); + + this.loc = loc; this.isActiveProb = isActiveProb; this.resultState = resultState; this.resultValue = resultValue; } + /** + * Get the genome loc associated with the ActivityProfileResult + * @return the location of this result + */ + @Ensures("result != null") + public GenomeLoc getLoc() { + return loc; + } + + @Override + public String toString() { + return "ActivityProfileResult{" + + "loc=" + loc + + ", isActiveProb=" + isActiveProb + + ", resultState=" + resultState + + ", resultValue=" + resultValue + + '}'; + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java index 043e5e185..652f7f96f 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java @@ -587,7 +587,7 @@ public abstract class AbstractVCFCodec extends AsciiFeatureCodec int nParts = ParsingUtils.split(str, genotypeParts, VCFConstants.FIELD_SEPARATOR_CHAR); if ( nParts != genotypeParts.length ) - generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records", lineNo); + generateException("there are " + (nParts-1) + " genotypes while the header requires that " + (genotypeParts.length-1) + " genotypes be present for all records at " + chr + ":" + pos, lineNo); ArrayList genotypes = new ArrayList(nParts); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index db54851dd..0e8a3ea70 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -29,6 +29,7 @@ import net.sf.picard.reference.FastaSequenceIndex; import net.sf.picard.reference.IndexedFastaSequenceFile; import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.SAMSequenceRecord; +import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -40,6 +41,8 @@ import java.util.Arrays; * A caching version of the IndexedFastaSequenceFile that avoids going to disk as often as the raw indexer. * * Thread-safe! Uses a thread-local cache + * + * Automatically upper-cases the bases coming in, unless they the flag preserveCase is explicitly set */ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { protected static final org.apache.log4j.Logger logger = org.apache.log4j.Logger.getLogger(CachingIndexedFastaSequenceFile.class); @@ -54,10 +57,15 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { public static final long DEFAULT_CACHE_SIZE = 1000000; /** The cache size of this CachingIndexedFastaSequenceFile */ - final long cacheSize; + private final long cacheSize; /** When we have a cache miss at position X, we load sequence from X - cacheMissBackup */ - final long cacheMissBackup; + private final long cacheMissBackup; + + /** + * If true, we will preserve the case of the original base in the genome, not + */ + private final boolean preserveCase; // information about checking efficiency long cacheHits = 0; @@ -84,37 +92,17 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { /** * Same as general constructor but allows one to override the default cacheSize * - * @param fasta - * @param index - * @param cacheSize + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize, final boolean preserveCase) { super(fasta, index); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * @param fasta The file to open. - * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. - * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. - */ - public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { - this(fasta, index, DEFAULT_CACHE_SIZE); - } - - /** - * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. - * - * Looks for a index file for fasta on disk - * - * @param fasta The file to open. - */ - public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { - this(fasta, DEFAULT_CACHE_SIZE); + this.preserveCase = preserveCase; } /** @@ -124,12 +112,76 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * Uses provided cacheSize instead of the default * * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case */ - public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize, final boolean preserveCase ) throws FileNotFoundException { super(fasta); if ( cacheSize < 0 ) throw new IllegalArgumentException("cacheSize must be > 0"); this.cacheSize = cacheSize; this.cacheMissBackup = Math.max(cacheSize / 1000, 1); + this.preserveCase = preserveCase; + } + +// /** +// * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. +// * +// * @param fasta The file to open. +// * @param index Pre-built FastaSequenceIndex, for the case in which one does not exist on disk. +// * @throws java.io.FileNotFoundException If the fasta or any of its supporting files cannot be found. +// */ +// public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index) { +// this(fasta, index, DEFAULT_CACHE_SIZE); +// } + + /** + * Same as general constructor but allows one to override the default cacheSize + * + * By default, this CachingIndexedFastaReader converts all incoming bases to upper case + * + * @param fasta the file we will read our FASTA sequence from. + * @param index the index of the fasta file, used for efficient random access + * @param cacheSize the size in bp of the cache we will use for this reader + */ + public CachingIndexedFastaSequenceFile(final File fasta, final FastaSequenceIndex index, final long cacheSize) { + this(fasta, index, cacheSize, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk. + * This CachingIndexedFastaReader will convert all FASTA bases to upper cases under the hood + * + * @param fasta The file to open. + */ + public CachingIndexedFastaSequenceFile(final File fasta) throws FileNotFoundException { + this(fasta, false); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * + * @param fasta The file to open. + * @param preserveCase If true, we will keep the case of the underlying bases in the FASTA, otherwise everything is converted to upper case + */ + public CachingIndexedFastaSequenceFile(final File fasta, final boolean preserveCase) throws FileNotFoundException { + this(fasta, DEFAULT_CACHE_SIZE, preserveCase); + } + + /** + * Open the given indexed fasta sequence file. Throw an exception if the file cannot be opened. + * + * Looks for a index file for fasta on disk + * Uses provided cacheSize instead of the default + * + * @param fasta The file to open. + * @param cacheSize the size of the cache to use in this CachingIndexedFastaReader, must be >= 0 + */ + public CachingIndexedFastaSequenceFile(final File fasta, final long cacheSize ) throws FileNotFoundException { + this(fasta, cacheSize, false); } /** @@ -168,6 +220,25 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { return cacheSize; } + /** + * Is this CachingIndexedFastaReader keeping the original case of bases in the fasta, or is + * everything being made upper case? + * + * @return true if the bases coming from this reader are in the original case in the fasta, false if they are all upper cased + */ + public boolean isPreservingCase() { + return preserveCase; + } + + /** + * Is uppercasing bases? + * + * @return true if bases coming from this CachingIndexedFastaSequenceFile are all upper cased, false if this reader are in the original case in the fasta + */ + public boolean isUppercasingBases() { + return ! isPreservingCase(); + } + /** * Gets the subsequence of the contig in the range [start,stop] * @@ -177,8 +248,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { * @param contig Contig whose subsequence to retrieve. * @param start inclusive, 1-based start of region. * @param stop inclusive, 1-based stop of region. - * @return The partial reference sequence associated with this range. + * @return The partial reference sequence associated with this range. If preserveCase is false, then + * all of the bases in the ReferenceSequence returned by this method will be upper cased. */ + @Override public ReferenceSequence getSubsequenceAt( final String contig, final long start, final long stop ) { final ReferenceSequence result; final Cache myCache = cache.get(); @@ -186,6 +259,7 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { if ( (stop - start) >= cacheSize ) { cacheMisses++; result = super.getSubsequenceAt(contig, start, stop); + if ( ! preserveCase ) StringUtil.toUpperCase(result.getBases()); } else { // todo -- potential optimization is to check if contig.name == contig, as this in generally will be true SAMSequenceRecord contigInfo = super.getSequenceDictionary().getSequence(contig); @@ -198,7 +272,9 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { myCache.start = Math.max(start - cacheMissBackup, 0); myCache.stop = Math.min(start + cacheSize + cacheMissBackup, contigInfo.getSequenceLength()); myCache.seq = super.getSubsequenceAt(contig, myCache.start, myCache.stop); - //System.out.printf("New cache at %s %d-%d%n", contig, cacheStart, cacheStop); + + // convert all of the bases in the sequence to upper case if we aren't preserving cases + if ( ! preserveCase ) StringUtil.toUpperCase(myCache.seq.getBases()); } else { cacheHits++; } @@ -215,8 +291,10 @@ public class CachingIndexedFastaSequenceFile extends IndexedFastaSequenceFile { } } + // for debugging -- print out our efficiency if requested if ( PRINT_EFFICIENCY && (getCacheHits() + getCacheMisses()) % PRINT_FREQUENCY == 0 ) printEfficiency(Priority.INFO); + return result; } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ed6fc46bb..42938d2a6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup sortedElements = new TreeSet(new Comparator() { + @Override + public int compare(PE element1, PE element2) { + final int difference = element1.getRead().getAlignmentStart() - element2.getRead().getAlignmentStart(); + return difference != 0 ? difference : element1.getRead().getReadName().compareTo(element2.getRead().getReadName()); + } + }); + + if (pileupElementTracker instanceof PerSamplePileupElementTracker) { + PerSamplePileupElementTracker tracker = (PerSamplePileupElementTracker) pileupElementTracker; + + for (final String sample : tracker.getSamples()) { + PileupElementTracker perSampleElements = tracker.getElements(sample); + for (PE pile : perSampleElements) + sortedElements.add(pile); + } + } + else { + UnifiedPileupElementTracker tracker = (UnifiedPileupElementTracker) pileupElementTracker; + for (PE pile : tracker) + sortedElements.add(pile); + } + + UnifiedPileupElementTracker sortedTracker = new UnifiedPileupElementTracker(); + for (PE pile : sortedElements) + sortedTracker.add(pile); + + return (RBP) createNewPileup(loc, sortedTracker); + } + @Override public FragmentCollection toFragments() { return FragmentUtils.create(this); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index f15468840..be61bad99 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -60,6 +60,16 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public ReadBackedPileup getOverlappingFragmentFilteredPileup(); + /** + * Returns a new ReadBackedPileup where only one read from an overlapping read + * pair is retained. If discardDiscordant and the two reads in question disagree to their basecall, + * neither read is retained. Otherwise, the read with the higher + * quality (base or mapping, depending on baseQualNotMapQual) observation is retained + * + * @return the newly filtered pileup + */ + public ReadBackedPileup getOverlappingFragmentFilteredPileup(boolean discardDiscordant, boolean baseQualNotMapQual); + /** * Returns a new ReadBackedPileup that is free of mapping quality zero reads in this pileup. Note that this * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy @@ -261,6 +271,13 @@ public interface ReadBackedPileup extends Iterable, HasGenomeLoca */ public byte[] getMappingQuals(); + /** + * Returns a new ReadBackedPileup that is sorted by start coordinate of the reads. + * + * @return + */ + public ReadBackedPileup getStartSortedPileup(); + /** * Converts this pileup into a FragmentCollection (see FragmentUtils for documentation) * @return diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index e3348d3de..207988749 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -199,7 +199,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%.2f,%,2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); } public String stringForCSV() { diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java index 5d0d94b69..a9b6c7152 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/covariates/CycleCovariate.java @@ -49,7 +49,7 @@ import java.util.EnumSet; public class CycleCovariate implements StandardCovariate { - private static final int MAXIMUM_CYCLE_VALUE = 1000; + private int MAXIMUM_CYCLE_VALUE; private static final int CUSHION_FOR_INDELS = 4; private String default_platform = null; @@ -59,6 +59,8 @@ public class CycleCovariate implements StandardCovariate { // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { + this.MAXIMUM_CYCLE_VALUE = RAC.MAXIMUM_CYCLE_VALUE; + if (RAC.DEFAULT_PLATFORM != null && !NGSPlatform.isKnown(RAC.DEFAULT_PLATFORM)) throw new UserException.CommandLineException("The requested default platform (" + RAC.DEFAULT_PLATFORM + ") is not a recognized platform."); @@ -88,6 +90,9 @@ public class CycleCovariate implements StandardCovariate { final int MAX_CYCLE_FOR_INDELS = readLength - CUSHION_FOR_INDELS - 1; for (int i = 0; i < readLength; i++) { + if ( cycle > MAXIMUM_CYCLE_VALUE ) + throw new UserException("The maximum allowed value for the cycle is " + MAXIMUM_CYCLE_VALUE + ", but a larger cycle was detected in read " + read.getReadName() + ". Please use the --maximum_cycle_value argument to increase this value (at the expense of requiring more memory to run)"); + final int substitutionKey = keyFromCycle(cycle); final int indelKey = (i < CUSHION_FOR_INDELS || i > MAX_CYCLE_FOR_INDELS) ? -1 : substitutionKey; values.addCovariate(substitutionKey, indelKey, indelKey, i); diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index 4f1e66ba2..585578958 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -646,7 +646,7 @@ public class AlignmentUtils { // get the indel element and move it left one base CigarElement ce = cigar.getCigarElement(indexOfIndel - 1); - elements.add(new CigarElement(ce.getLength() - 1, ce.getOperator())); + elements.add(new CigarElement(Math.max(ce.getLength() - 1, 0), ce.getOperator())); elements.add(cigar.getCigarElement(indexOfIndel)); if (indexOfIndel + 1 < cigar.numCigarElements()) { ce = cigar.getCigarElement(indexOfIndel + 1); diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java index 27a5b0c24..12f9cb20c 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java @@ -184,9 +184,6 @@ public class VariantContext implements Feature { // to enable tribble integratio protected CommonInfo commonInfo = null; public final static double NO_LOG10_PERROR = CommonInfo.NO_LOG10_PERROR; - @Deprecated // ID is no longer stored in the attributes map - private final static String ID_KEY = "ID"; - public final static Set PASSES_FILTERS = Collections.unmodifiableSet(new LinkedHashSet()); /** The location of this VariantContext */ @@ -287,10 +284,6 @@ public class VariantContext implements Feature { // to enable tribble integratio this.commonInfo = new CommonInfo(source, log10PError, filters, attributes); - // todo -- remove me when this check is no longer necessary - if ( this.commonInfo.hasAttribute(ID_KEY) ) - throw new IllegalArgumentException("Trying to create a VariantContext with a ID key. Please use provided constructor argument ID"); - if ( alleles == null ) { throw new IllegalArgumentException("Alleles cannot be null"); } // we need to make this a LinkedHashSet in case the user prefers a given ordering of alleles diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 81959c998..1f1867f75 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -979,6 +979,40 @@ public class VariantContextUtils { private static final List NO_CALL_ALLELES = Arrays.asList(Allele.NO_CALL, Allele.NO_CALL); public static final double SUM_GL_THRESH_NOCALL = -0.1; // if sum(gl) is bigger than this threshold, we treat GL's as non-informative and will force a no-call. + /** + * Split variant context into its biallelic components if there are more than 2 alleles + * + * For VC has A/B/C alleles, returns A/B and A/C contexts. + * Genotypes are all no-calls now (it's not possible to fix them easily) + * Alleles are right trimmed to satisfy VCF conventions + * + * If vc is biallelic or non-variant it is just returned + * + * Chromosome counts are updated (but they are by definition 0) + * + * @param vc a potentially multi-allelic variant context + * @return a list of bi-allelic (or monomorphic) variant context + */ + public static List splitVariantContextToBiallelics(final VariantContext vc) { + if ( ! vc.isVariant() || vc.isBiallelic() ) + // non variant or biallelics already satisfy the contract + return Collections.singletonList(vc); + else { + final List biallelics = new LinkedList(); + + for ( final Allele alt : vc.getAlternateAlleles() ) { + VariantContextBuilder builder = new VariantContextBuilder(vc); + final List alleles = Arrays.asList(vc.getReference(), alt); + builder.alleles(alleles); + builder.genotypes(VariantContextUtils.subsetDiploidAlleles(vc, alleles, false)); + calculateChromosomeCounts(builder, true); + biallelics.add(reverseTrimAlleles(builder.make())); + } + + return biallelics; + } + } + /** * subset the Variant Context to the specific set of alleles passed in (pruning the PLs appropriately) * diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java index 7b8224568..9c63a69e7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/writer/BCF2FieldWriterManager.java @@ -76,7 +76,6 @@ public class BCF2FieldWriterManager { if ( map.containsKey(field) ) throw new ReviewedStingException("BUG: field " + field + " already seen in VCFHeader while building BCF2 field encoders"); map.put(field, writer); - if ( logger.isDebugEnabled() ) logger.debug(writer); } // ----------------------------------------------------------------- diff --git a/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java new file mode 100644 index 000000000..6a1a37de9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/TestNGTestTransformer.java @@ -0,0 +1,37 @@ +package org.broadinstitute.sting; + +import org.apache.log4j.Logger; +import org.testng.IAnnotationTransformer; +import org.testng.annotations.ITestAnnotation; + +import java.lang.reflect.Constructor; +import java.lang.reflect.Method; + +/** + * Provide default @Test values for GATK testng tests. + * + * Currently only sets the maximum runtime to 10 minutes, if it's not been specified. + * + * See http://beust.com/weblog/2006/10/18/annotation-transformers-in-java/ + * + * @author depristo + * @since 10/31/12 + * @version 0.1 + */ +public class TestNGTestTransformer implements IAnnotationTransformer { + public static final long DEFAULT_TIMEOUT = 1000 * 60 * 10; // 10 minutes max per test + + final static Logger logger = Logger.getLogger(TestNGTestTransformer.class); + + public void transform(ITestAnnotation annotation, + Class testClass, + Constructor testConstructor, + Method testMethod) + { + if ( annotation.getTimeOut() == 0 ) { + logger.warn("test " + testMethod.toString() + " has no specified timeout, adding default timeout " + DEFAULT_TIMEOUT / 1000 / 60 + " minutes"); + annotation.setTimeOut(DEFAULT_TIMEOUT); + } + } +} + diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java new file mode 100644 index 000000000..e4c7b2db0 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsTest.java @@ -0,0 +1,358 @@ +package org.broadinstitute.sting.gatk.traversals; + +import net.sf.samtools.*; +import org.broadinstitute.sting.utils.interval.IntervalMergingRule; +import org.broadinstitute.sting.utils.interval.IntervalUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.reads.MockLocusShard; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.executive.WindowMaker; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/13/12 + * Time: 2:47 PM + * + * Test the Active Region Traversal Contract + * http://iwww.broadinstitute.org/gsa/wiki/index.php/Active_Region_Traversal_Contract + */ +public class TraverseActiveRegionsTest extends BaseTest { + + private class DummyActiveRegionWalker extends ActiveRegionWalker { + private final double prob; + protected List isActiveCalls = new ArrayList(); + protected Map mappedActiveRegions = new HashMap(); + + public DummyActiveRegionWalker() { + this.prob = 1.0; + } + + @Override + public ActivityProfileResult isActive(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { + isActiveCalls.add(ref.getLocus()); + return new ActivityProfileResult(ref.getLocus(), prob); + } + + @Override + public Integer map(ActiveRegion activeRegion, RefMetaDataTracker metaDataTracker) { + mappedActiveRegions.put(activeRegion.getLocation(), activeRegion); + return 0; + } + + @Override + public Integer reduceInit() { + return 0; + } + + @Override + public Integer reduce(Integer value, Integer sum) { + return 0; + } + } + + private final TraverseActiveRegions t = new TraverseActiveRegions(); + + private IndexedFastaSequenceFile reference; + private SAMSequenceDictionary dictionary; + private GenomeLocParser genomeLocParser; + + private List intervals; + private List reads; + + @BeforeClass + private void init() throws FileNotFoundException { + reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); + dictionary = reference.getSequenceDictionary(); + genomeLocParser = new GenomeLocParser(dictionary); + + intervals = new ArrayList(); + intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1, 999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + intervals.add(genomeLocParser.createGenomeLoc("1", 10000, 20000)); + // TODO: this fails! + //intervals.add(genomeLocParser.createGenomeLoc("20", 10000, 20000)); + intervals = IntervalUtils.sortAndMergeIntervals(genomeLocParser, intervals, IntervalMergingRule.OVERLAPPING_ONLY).toList(); + + reads = new ArrayList(); + reads.add(buildSAMRecord("simple", "1", 100, 200)); + reads.add(buildSAMRecord("overlap_equal", "1", 10, 20)); + reads.add(buildSAMRecord("overlap_unequal", "1", 10, 21)); + reads.add(buildSAMRecord("boundary_equal", "1", 1990, 2009)); + reads.add(buildSAMRecord("boundary_unequal", "1", 1995, 2050)); + reads.add(buildSAMRecord("extended_only", "1", 3000, 3100)); + reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); + reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + // TODO + //reads.add(buildSAMRecord("simple20", "20", 10100, 10150)); + } + + @Test + public void testAllBasesSeen() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + List activeIntervals = getIsActiveIntervals(walker, intervals); + // Contract: Every genome position in the analysis interval(s) is processed by the walker's isActive() call + verifyEqualIntervals(intervals, activeIntervals); + + // TODO: more tests and edge cases + } + + private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { + List activeIntervals = new ArrayList(); + for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) { + t.traverse(walker, dataProvider, 0); + activeIntervals.addAll(walker.isActiveCalls); + } + + return activeIntervals; + } + + @Test + public void testActiveRegionCoverage() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + Collection activeRegions = getActiveRegions(walker, intervals).values(); + verifyActiveRegionCoverage(intervals, activeRegions); + + // TODO: more tests and edge cases + } + + private void verifyActiveRegionCoverage(List intervals, Collection activeRegions) { + List intervalStarts = new ArrayList(); + List intervalStops = new ArrayList(); + + for (GenomeLoc interval : intervals) { + intervalStarts.add(interval.getStartLocation()); + intervalStops.add(interval.getStopLocation()); + } + + Map baseRegionMap = new HashMap(); + + for (ActiveRegion activeRegion : activeRegions) { + for (GenomeLoc activeLoc : toSingleBaseLocs(activeRegion.getLocation())) { + // Contract: Regions do not overlap + Assert.assertFalse(baseRegionMap.containsKey(activeLoc), "Genome location " + activeLoc + " is assigned to more than one region"); + baseRegionMap.put(activeLoc, activeRegion); + } + + GenomeLoc start = activeRegion.getLocation().getStartLocation(); + if (intervalStarts.contains(start)) + intervalStarts.remove(start); + + GenomeLoc stop = activeRegion.getLocation().getStopLocation(); + if (intervalStops.contains(stop)) + intervalStops.remove(stop); + } + + for (GenomeLoc baseLoc : toSingleBaseLocs(intervals)) { + // Contract: Each location in the interval(s) is in exactly one region + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertTrue(baseRegionMap.containsKey(baseLoc), "Genome location " + baseLoc + " is not assigned to any region"); + baseRegionMap.remove(baseLoc); + } + + // Contract: The total set of regions exactly matches the analysis interval(s) + Assert.assertEquals(baseRegionMap.size(), 0, "Active regions contain base(s) outside of the given intervals"); + + // Contract: All explicit interval boundaries must also be region boundaries + Assert.assertEquals(intervalStarts.size(), 0, "Interval start location does not match an active region start location"); + Assert.assertEquals(intervalStops.size(), 0, "Interval stop location does not match an active region stop location"); + } + + @Test + public void testReadMapping() { + DummyActiveRegionWalker walker = new DummyActiveRegionWalker(); + + // Contract: Each read has the Primary state in a single region (or none) + // This is the region of maximum overlap for the read (earlier if tied) + + // Contract: Each read has the Non-Primary state in all other regions it overlaps + // Contract: Each read has the Extended state in regions where it only overlaps if the region is extended + + // simple: Primary in 1:1-999 + // overlap_equal: Primary in 1:1-999 + // overlap_unequal: Primary in 1:1-999 + // boundary_equal: Primary in 1:1000-1999, Non-Primary in 1:2000-2999 + // boundary_unequal: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 + // extended_only: Extended in 1:2000-2999 + // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 + // outside_intervals: none + + // TODO + // simple20: Primary in 20:10000-20000 + + Map activeRegions = getActiveRegions(walker, intervals); + ActiveRegion region; + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1, 999)); + + verifyReadPrimary(region, "simple"); + verifyReadPrimary(region, "overlap_equal"); + verifyReadPrimary(region, "overlap_unequal"); + verifyReadNotPlaced(region, "boundary_equal"); + verifyReadNotPlaced(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadNonPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 1000, 1999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadPrimary(region, "boundary_equal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_unequal"); + verifyReadNotPlaced(region, "extended_only"); + // TODO: fail verifyReadPrimary(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); + + verifyReadNotPlaced(region, "simple"); + verifyReadNotPlaced(region, "overlap_equal"); + verifyReadNotPlaced(region, "overlap_unequal"); + // TODO: fail verifyReadNonPrimary(region, "boundary_equal"); + verifyReadPrimary(region, "boundary_unequal"); + // TODO: fail verifyReadExtended(region, "extended_only"); + // TODO: fail verifyReadExtended(region, "extended_and_np"); + verifyReadNotPlaced(region, "outside_intervals"); + + // TODO: more tests and edge cases + } + + private void verifyReadPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + Assert.assertFalse(read.getNotPrimaryAlignmentFlag(), "Read " + read + " not primary in active region " + region); + } + + private void verifyReadNonPrimary(ActiveRegion region, String readName) { + SAMRecord read = getRead(region, readName); + Assert.assertTrue(read.getNotPrimaryAlignmentFlag(), "Read " + read + " primary in active region " + region); + } + + private void verifyReadExtended(ActiveRegion region, String readName) { + Assert.fail("The Extended read state has not been implemented"); + } + + private void verifyReadNotPlaced(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + Assert.fail("Read " + readName + " found in active region " + region); + } + } + + private SAMRecord getRead(ActiveRegion region, String readName) { + for (SAMRecord read : region.getReads()) { + if (read.getReadName().equals(readName)) + return read; + } + + Assert.fail("Read " + readName + " not found in active region " + region); + return null; + } + + private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { + for (LocusShardDataProvider dataProvider : createDataProviders(intervals)) + t.traverse(walker, dataProvider, 0); + + return walker.mappedActiveRegions; + } + + private Collection toSingleBaseLocs(GenomeLoc interval) { + List bases = new ArrayList(); + if (interval.size() == 1) + bases.add(interval); + else { + for (int location = interval.getStart(); location <= interval.getStop(); location++) + bases.add(genomeLocParser.createGenomeLoc(interval.getContig(), location, location)); + } + + return bases; + } + + private Collection toSingleBaseLocs(List intervals) { + Set bases = new TreeSet(); // for sorting and uniqueness + for (GenomeLoc interval : intervals) + bases.addAll(toSingleBaseLocs(interval)); + + return bases; + } + + private void verifyEqualIntervals(List aIntervals, List bIntervals) { + Collection aBases = toSingleBaseLocs(aIntervals); + Collection bBases = toSingleBaseLocs(bIntervals); + + Assert.assertTrue(aBases.size() == bBases.size(), "Interval lists have a differing number of bases: " + aBases.size() + " vs. " + bBases.size()); + + Iterator aIter = aBases.iterator(); + Iterator bIter = bBases.iterator(); + while (aIter.hasNext() && bIter.hasNext()) { + GenomeLoc aLoc = aIter.next(); + GenomeLoc bLoc = bIter.next(); + Assert.assertTrue(aLoc.equals(bLoc), "Interval locations do not match: " + aLoc + " vs. " + bLoc); + } + } + + // copied from LocusViewTemplate + protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { + SAMFileHeader header = new SAMFileHeader(); + header.setSequenceDictionary(dictionary); + GATKSAMRecord record = new GATKSAMRecord(header); + + record.setReadName(readName); + record.setReferenceIndex(dictionary.getSequenceIndex(contig)); + record.setAlignmentStart(alignmentStart); + + Cigar cigar = new Cigar(); + int len = alignmentEnd - alignmentStart + 1; + cigar.add(new CigarElement(len, CigarOperator.M)); + record.setCigar(cigar); + record.setReadBases(new byte[len]); + record.setBaseQualities(new byte[len]); + + return record; + } + + private List createDataProviders(List intervals) { + GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); + engine.setGenomeLocParser(genomeLocParser); + t.initialize(engine); + + StingSAMIterator iterator = ArtificialSAMUtils.createReadIterator(reads); + Shard shard = new MockLocusShard(genomeLocParser, intervals); + + List providers = new ArrayList(); + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, iterator, shard.getGenomeLocs())) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + + return providers; + } +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 01dff0089..b097e3d34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -151,7 +151,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { } @Test - public void testTabixAnnotations() { + public void testTabixAnnotationsAndParallelism() { final String MD5 = "99938d1e197b8f10c408cac490a00a62"; for ( String file : Arrays.asList("CEU.exon.2010_03.sites.vcf", "CEU.exon.2010_03.sites.vcf.gz")) { WalkerTestSpec spec = new WalkerTestSpec( @@ -159,6 +159,12 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { Arrays.asList(MD5)); executeTest("Testing lookup vcf tabix vs. vcf tribble", spec); } + + WalkerTestSpec spec = new WalkerTestSpec( + baseTestString() + " -A HomopolymerRun -nt 2 --variant:vcf " + validationDataLocation + "CEU.exon.2010_03.sites.vcf -L " + validationDataLocation + "CEU.exon.2010_03.sites.vcf --no_cmdline_in_header", 1, + Arrays.asList(MD5)); + + executeTest("Testing lookup vcf tabix vs. vcf tribble plus parallelism", spec); } @Test diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java index c9e91e664..ec69a893c 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalkerIntegrationTest.java @@ -38,7 +38,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalkerBed() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("9e4ec9c23f21a8162d27a39ab057398c", SUMMARY_MD5)); + Arrays.asList("42e86c06c167246a28bffdacaca75ffb", SUMMARY_MD5)); executeTest("formatBed", spec); } @@ -46,7 +46,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalkerPerBase() { String gatk_args = commonArgs + " -format STATE_PER_BASE -L 1:10,000,000-11,000,000 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("e6044b4495ef24f542403e6a94437068", SUMMARY_MD5)); + Arrays.asList("d66c525d9c70f62df8156261d3e535ad", SUMMARY_MD5)); executeTest("format_state_per_base", spec); } @@ -54,7 +54,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalker2() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-10,000,100 -L 1:10,000,110-10,000,120 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("c671f65712d9575b8b3e1f1dbedc146e", "d287510eac04acf5a56f5cde2cba0e4a")); + Arrays.asList("330f476085533db92a9dbdb3a127c041", "d287510eac04acf5a56f5cde2cba0e4a")); executeTest("formatBed by interval", spec); } @@ -62,7 +62,7 @@ public class CallableLociWalkerIntegrationTest extends WalkerTest { public void testCallableLociWalker3() { String gatk_args = commonArgs + " -format BED -L 1:10,000,000-11,000,000 -minDepth 10 -maxDepth 100 --minBaseQuality 10 --minMappingQuality 20 -summary %s"; WalkerTestSpec spec = new WalkerTestSpec(gatk_args, 2, - Arrays.asList("b7d26a470ef906590249f2fa45fd6bdd", "da431d393f7c2b2b3e27556b86c1dbc7")); + Arrays.asList("46a53379aaaf9803276a0a34b234f6ab", "da431d393f7c2b2b3e27556b86c1dbc7")); executeTest("formatBed lots of arguments", spec); } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java index 109088875..c5a5dcc21 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLargeScaleTest.java @@ -7,7 +7,7 @@ import java.util.ArrayList; public class UnifiedGenotyperLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWholeGenome() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + @@ -22,7 +22,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest { executeTest("testUnifiedGenotyperWholeGenome", spec); } - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWholeExome() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + @@ -37,7 +37,7 @@ public class UnifiedGenotyperLargeScaleTest extends WalkerTest { executeTest("testUnifiedGenotyperWholeExome", spec); } - @Test + @Test( timeOut = 18000000 ) public void testUnifiedGenotyperWGParallel() { WalkerTestSpec spec = new WalkerTestSpec( "-R " + hg18Reference + diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java new file mode 100755 index 000000000..783a8d7fc --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperLiteIntegrationTest.java @@ -0,0 +1,35 @@ +package org.broadinstitute.sting.gatk.walkers.genotyper; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.utils.classloader.GATKLiteUtils; +import org.testng.SkipException; +import org.testng.annotations.Test; + +import java.util.Arrays; + +// ********************************************************************************** // +// Note that this class also serves as an integration test for the VariantAnnotator! // +// ********************************************************************************** // + +public class UnifiedGenotyperLiteIntegrationTest extends WalkerTest { + + private final static String baseCommand = "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -minIndelFrac 0.0 --dbsnp " + b36dbSNP129; + + // -------------------------------------------------------------------------------------------------------------- + // + // testing contamination down-sampling gets ignored + // + // -------------------------------------------------------------------------------------------------------------- + + @Test + public void testContaminationDownsampling() { + if ( !GATKLiteUtils.isGATKLite() ) + throw new SkipException("Only want to test for GATK lite"); + + WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( + baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, + Arrays.asList("9addd225a985178339a0c49dc5fdc220")); + executeTest("test contamination_percentage_to_filter gets ignored", spec); + } + +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java index 4526fc0d7..2dd5a66fd 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/IndelRealignerLargeScaleTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; public class IndelRealignerLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testHighCoverage() { WalkerTestSpec spec = new WalkerTestSpec( @@ -21,7 +21,7 @@ public class IndelRealignerLargeScaleTest extends WalkerTest { executeTest("testIndelRealignerHighCoverage", spec); } - @Test + @Test( timeOut = 18000000 ) public void testRealigner() { WalkerTestSpec spec1 = new WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java index 3203ee100..e32afd06b 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/indels/RealignerTargetCreatorLargeScaleTest.java @@ -6,7 +6,7 @@ import org.testng.annotations.Test; import java.util.ArrayList; public class RealignerTargetCreatorLargeScaleTest extends WalkerTest { - @Test + @Test( timeOut = 18000000 ) public void testRealignerTargetCreator() { WalkerTestSpec spec1 = new WalkerTestSpec( diff --git a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java index 3d21e654f..6138e7396 100755 --- a/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/GenomeLocSortedSetUnitTest.java @@ -6,6 +6,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; import org.testng.annotations.BeforeClass; @@ -117,6 +118,41 @@ public class GenomeLocSortedSetUnitTest extends BaseTest { assertTrue(loc.getContigIndex() == 1); } + @Test + public void overlap() { + for ( int i = 1; i < 6; i++ ) { + final int start = i * 10; + mSortedSet.add(genomeLocParser.createGenomeLoc(contigOneName, start, start + 1)); + } + + // test matches in and around interval + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 9, 9))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 10, 10))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 11))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 12))); + + // test matches spanning intervals + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 14, 20))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 11, 15))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 30, 40))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + + // test miss + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 12, 19))); + + // test exact match after miss + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 40, 41))); + + // test matches at beginning of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 5, 6))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 0, 10))); + + // test matches at end of intervals + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + assertTrue(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 51, 53))); + assertFalse(mSortedSet.overlaps(genomeLocParser.createGenomeLoc(contigOneName, 52, 53))); + } + @Test public void mergingOverlappingAbove() { GenomeLoc e = genomeLocParser.createGenomeLoc(contigOneName, 0, 50); diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java index ddffb6e4c..13db1d39e 100644 --- a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -159,7 +159,7 @@ public class HaplotypeUnitTest extends BaseTest { final VariantContext vc = new VariantContextBuilder().alleles(alleles).loc("1", loc, loc + h1refAllele.getBases().length - 1).make(); h.setAlignmentStartHapwrtRef(0); h.setCigar(cigar); - final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc); + final Haplotype h1 = h.insertAllele(vc.getReference(), vc.getAlternateAllele(0), loc, vc.getStart()); final Haplotype h1expected = new Haplotype(newHap.getBytes()); Assert.assertEquals(h1, h1expected); } diff --git a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java index f7c564c74..57dd19888 100644 --- a/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/activeregion/ActivityProfileUnitTest.java @@ -123,7 +123,7 @@ public class ActivityProfileUnitTest extends BaseTest { for ( int i = 0; i < cfg.probs.size(); i++ ) { double p = cfg.probs.get(i); GenomeLoc loc = genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart() + i, cfg.regionStart.getStart() + i); - profile.add(loc, new ActivityProfileResult(p)); + profile.add(new ActivityProfileResult(loc, p)); } Assert.assertEquals(profile.regionStartLoc, genomeLocParser.createGenomeLoc(cfg.regionStart.getContig(), cfg.regionStart.getStart(), cfg.regionStart.getStart() )); diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index 736162300..bcd846184 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -30,6 +30,7 @@ import java.util.concurrent.Executors; public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { private File simpleFasta = new File(publicTestDir + "/exampleFASTA.fasta"); private static final int STEP_SIZE = 1; + private final static boolean DEBUG = false; //private static final List QUERY_SIZES = Arrays.asList(1); private static final List QUERY_SIZES = Arrays.asList(1, 10, 100); @@ -53,9 +54,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { return cacheSizeRequested == -1 ? CachingIndexedFastaSequenceFile.DEFAULT_CACHE_SIZE : cacheSizeRequested; } - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderSequential1(File fasta, int cacheSize, int querySize) throws FileNotFoundException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = caching.getSequenceDictionary().getSequence(0); logger.warn(String.format("Checking contig %s length %d with cache size %d and query size %d", @@ -64,6 +65,8 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } private void testSequential(final CachingIndexedFastaSequenceFile caching, final File fasta, final int querySize) throws FileNotFoundException { + Assert.assertTrue(caching.isPreservingCase(), "testSequential only works for case preserving CachingIndexedFastaSequenceFile readers"); + final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -92,10 +95,10 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } // Tests grabbing sequences around a middle cached value. - @Test(dataProvider = "fastas", enabled = true) + @Test(dataProvider = "fastas", enabled = true && ! DEBUG) public void testCachingIndexedFastaReaderTwoStage(File fasta, int cacheSize, int querySize) throws FileNotFoundException { final IndexedFastaSequenceFile uncached = new IndexedFastaSequenceFile(fasta); - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); SAMSequenceRecord contig = uncached.getSequenceDictionary().getSequence(0); @@ -123,11 +126,6 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { @DataProvider(name = "ParallelFastaTest") public Object[][] createParallelFastaTest() { List params = new ArrayList(); -// for ( int nt : Arrays.asList(1, 2, 3) ) { -// for ( int cacheSize : CACHE_SIZES ) { -// params.add(new Object[]{simpleFasta, cacheSize, 10, nt}); -// } -// } for ( File fasta : Arrays.asList(simpleFasta) ) { for ( int cacheSize : CACHE_SIZES ) { @@ -143,9 +141,9 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { } - @Test(dataProvider = "ParallelFastaTest", enabled = true, timeOut = 60000) + @Test(dataProvider = "ParallelFastaTest", enabled = true && ! DEBUG, timeOut = 60000) public void testCachingIndexedFastaReaderParallel(final File fasta, final int cacheSize, final int querySize, final int nt) throws FileNotFoundException, InterruptedException { - final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize)); + final CachingIndexedFastaSequenceFile caching = new CachingIndexedFastaSequenceFile(fasta, getCacheSize(cacheSize), true); logger.warn(String.format("Parallel caching index fasta reader test cacheSize %d querySize %d nt %d", caching.getCacheSize(), querySize, nt)); for ( int iterations = 0; iterations < 1; iterations++ ) { @@ -163,4 +161,49 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { executor.shutdownNow(); } } + + // make sure some bases are lower case and some are upper case + @Test(enabled = true) + public void testMixedCasesInExample() throws FileNotFoundException, InterruptedException { + final IndexedFastaSequenceFile original = new IndexedFastaSequenceFile(new File(exampleFASTA)); + final CachingIndexedFastaSequenceFile casePreserving = new CachingIndexedFastaSequenceFile(new File(exampleFASTA), true); + final CachingIndexedFastaSequenceFile allUpper = new CachingIndexedFastaSequenceFile(new File(exampleFASTA)); + + int nMixedCase = 0; + for ( SAMSequenceRecord contig : original.getSequenceDictionary().getSequences() ) { + nMixedCase += testCases(original, casePreserving, allUpper, contig.getSequenceName(), -1, -1); + + final int step = 100; + for ( int lastPos = step; lastPos < contig.getSequenceLength(); lastPos += step ) { + testCases(original, casePreserving, allUpper, contig.getSequenceName(), lastPos - step, lastPos); + } + } + + Assert.assertTrue(nMixedCase > 0, "No mixed cases sequences found in file. Unexpected test state"); + } + + private int testCases(final IndexedFastaSequenceFile original, + final IndexedFastaSequenceFile casePreserving, + final IndexedFastaSequenceFile allUpper, + final String contig, final int start, final int stop ) { + final String orig = fetchBaseString(original, contig, start, stop); + final String keptCase = fetchBaseString(casePreserving, contig, start, stop); + final String upperCase = fetchBaseString(allUpper, contig, start, stop).toUpperCase(); + + final String origToUpper = orig.toUpperCase(); + if ( ! orig.equals(origToUpper) ) { + Assert.assertEquals(keptCase, orig, "Case preserving operation not equal to the original case for contig " + contig); + Assert.assertEquals(upperCase, origToUpper, "All upper case reader not equal to the uppercase of original case for contig " + contig); + return 1; + } else { + return 0; + } + } + + private String fetchBaseString(final IndexedFastaSequenceFile reader, final String contig, final int start, final int stop) { + if ( start == -1 ) + return new String(reader.getSequence(contig).getBases()); + else + return new String(reader.getSubsequenceAt(contig, start, stop).getBases()); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java index c3d93b2cb..b73b1a311 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/CycleCovariateUnitTest.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.gatk.walkers.bqsr.RecalibrationArgumentCollection; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.recalibration.covariates.CycleCovariate; import org.broadinstitute.sting.utils.sam.GATKSAMReadGroupRecord; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -24,7 +25,7 @@ public class CycleCovariateUnitTest { covariate.initialize(RAC); } - @Test(enabled = false) + @Test(enabled = true) public void testSimpleCycles() { short readLength = 10; GATKSAMRecord read = ReadUtils.createRandomRead(readLength); @@ -53,9 +54,31 @@ public class CycleCovariateUnitTest { for (short i = 0; i < values.length; i++) { short actual = Short.decode(covariate.formatKey(values[i][0])); int expected = init + (increment * i); - // System.out.println(String.format("%d: %d, %d", i, actual, expected)); Assert.assertEquals(actual, expected); } } + @Test(enabled = true, expectedExceptions={UserException.class}) + public void testMoreThanMaxCycleFails() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE + 1; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } + + @Test(enabled = true) + public void testMaxCyclePasses() { + int readLength = RAC.MAXIMUM_CYCLE_VALUE; + GATKSAMRecord read = ReadUtils.createRandomRead(readLength); + read.setReadPairedFlag(true); + read.setReadGroup(new GATKSAMReadGroupRecord("MY.ID")); + read.getReadGroup().setPlatform("illumina"); + + ReadCovariates readCovariates = new ReadCovariates(read.getReadLength(), 1); + covariate.recordValues(read, readCovariates); + } } diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java index 6785fa816..c57b2a44d 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextTestProvider.java @@ -782,7 +782,7 @@ public class VariantContextTestProvider { Assert.assertEquals(actual.getStart(), expected.getStart(), "start"); Assert.assertEquals(actual.getEnd(), expected.getEnd(), "end"); Assert.assertEquals(actual.getID(), expected.getID(), "id"); - Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles"); + Assert.assertEquals(actual.getAlleles(), expected.getAlleles(), "alleles for " + expected + " vs " + actual); assertAttributesEquals(actual.getAttributes(), expected.getAttributes()); Assert.assertEquals(actual.filtersWereApplied(), expected.filtersWereApplied(), "filtersWereApplied"); diff --git a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java index 114104d42..f3daa9e4c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUtilsUnitTest.java @@ -26,7 +26,7 @@ package org.broadinstitute.sting.utils.variantcontext; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.testng.Assert; @@ -39,7 +39,7 @@ import java.io.FileNotFoundException; import java.util.*; public class VariantContextUtilsUnitTest extends BaseTest { - Allele Aref, T, C, Cref, ATC, ATCATC; + Allele Aref, T, C, G, Cref, ATC, ATCATC; private GenomeLocParser genomeLocParser; @BeforeSuite @@ -58,6 +58,7 @@ public class VariantContextUtilsUnitTest extends BaseTest { Cref = Allele.create("C", true); T = Allele.create("T"); C = Allele.create("C"); + G = Allele.create("G"); ATC = Allele.create("ATC"); ATCATC = Allele.create("ATCATC"); } @@ -697,10 +698,120 @@ public class VariantContextUtilsUnitTest extends BaseTest { return ReverseClippingPositionTestProvider.getTests(ReverseClippingPositionTestProvider.class); } - @Test(dataProvider = "ReverseClippingPositionTestProvider") public void testReverseClippingPositionTestProvider(ReverseClippingPositionTestProvider cfg) { int result = VariantContextUtils.computeReverseClipping(cfg.alleles, cfg.ref.getBytes(), 0, false); Assert.assertEquals(result, cfg.expectedClip); } -} + + // -------------------------------------------------------------------------------- + // + // test splitting into bi-allelics + // + // -------------------------------------------------------------------------------- + + @DataProvider(name = "SplitBiallelics") + public Object[][] makeSplitBiallelics() throws CloneNotSupportedException { + List tests = new ArrayList(); + + final VariantContextBuilder root = new VariantContextBuilder("x", "20", 10, 10, Arrays.asList(Aref, C)); + + // biallelic -> biallelic + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + // monos -> monos + root.alleles(Arrays.asList(Aref)); + tests.add(new Object[]{root.make(), Arrays.asList(root.make())}); + + root.alleles(Arrays.asList(Aref, C, T)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make())}); + + root.alleles(Arrays.asList(Aref, C, T, G)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Aref, C)).make(), + root.alleles(Arrays.asList(Aref, T)).make(), + root.alleles(Arrays.asList(Aref, G)).make())}); + + final Allele C = Allele.create("C"); + final Allele CA = Allele.create("CA"); + final Allele CAA = Allele.create("CAA"); + final Allele CAAAA = Allele.create("CAAAA"); + final Allele CAAAAA = Allele.create("CAAAAA"); + final Allele Cref = Allele.create("C", true); + final Allele CAref = Allele.create("CA", true); + final Allele CAAref = Allele.create("CAA", true); + final Allele CAAAref = Allele.create("CAAA", true); + + root.alleles(Arrays.asList(Cref, CA, CAA)); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CA)).make(), + root.alleles(Arrays.asList(Cref, CAA)).make())}); + + root.alleles(Arrays.asList(CAAref, C, CA)).stop(12); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAref, C)).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, C, CA, CAA)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(CAAAref, C)).make(), + root.alleles(Arrays.asList(CAAref, C)).stop(12).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make())}); + + root.alleles(Arrays.asList(CAAAref, CAAAAA, CAAAA, CAA, C)).stop(13); + tests.add(new Object[]{root.make(), + Arrays.asList( + root.alleles(Arrays.asList(Cref, CAA)).stop(10).make(), + root.alleles(Arrays.asList(Cref, CA)).stop(10).make(), + root.alleles(Arrays.asList(CAref, C)).stop(11).make(), + root.alleles(Arrays.asList(CAAAref, C)).stop(13).make())}); + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "SplitBiallelics") + public void testSplitBiallelicsNoGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vc); + Assert.assertEquals(biallelics.size(), expectedBiallelics.size()); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + final VariantContext expected = expectedBiallelics.get(i); + VariantContextTestProvider.assertEquals(actual, expected); + } + } + + @Test(dataProvider = "SplitBiallelics", dependsOnMethods = "testSplitBiallelicsNoGenotypes") + public void testSplitBiallelicsGenotypes(final VariantContext vc, final List expectedBiallelics) { + final List genotypes = new ArrayList(); + + int sampleI = 0; + for ( final List alleles : Utils.makePermutations(vc.getAlleles(), 2, true) ) { + genotypes.add(GenotypeBuilder.create("sample" + sampleI, alleles)); + } + genotypes.add(GenotypeBuilder.createMissing("missing", 2)); + + final VariantContext vcWithGenotypes = new VariantContextBuilder(vc).genotypes(genotypes).make(); + + final List biallelics = VariantContextUtils.splitVariantContextToBiallelics(vcWithGenotypes); + for ( int i = 0; i < biallelics.size(); i++ ) { + final VariantContext actual = biallelics.get(i); + Assert.assertEquals(actual.getNSamples(), vcWithGenotypes.getNSamples()); // not dropping any samples + + for ( final Genotype inputGenotype : genotypes ) { + final Genotype actualGenotype = actual.getGenotype(inputGenotype.getSampleName()); + Assert.assertNotNull(actualGenotype); + if ( ! vc.isVariant() || vc.isBiallelic() ) + Assert.assertEquals(actualGenotype, vcWithGenotypes.getGenotype(inputGenotype.getSampleName())); + else + Assert.assertTrue(actualGenotype.isNoCall()); + } + } + } +} \ No newline at end of file diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala new file mode 100644 index 000000000..28a2534c0 --- /dev/null +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/CNV/xhmmCNVpipeline.scala @@ -0,0 +1,507 @@ +package org.broadinstitute.sting.queue.qscripts.CNV + +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.QScript +import org.broadinstitute.sting.queue.util.VCF_BAM_utilities +import org.broadinstitute.sting.queue.util.DoC._ +import org.broadinstitute.sting.commandline.Hidden +import java.io.{PrintStream, PrintWriter} +import org.broadinstitute.sting.utils.text.XReadLines +import collection.JavaConversions._ +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils + +class xhmmCNVpipeline extends QScript { + qscript => + + @Input(doc = "bam input, as .bam or as a list of files", shortName = "I", required = true) + var bams: File = _ + + @Input(doc = "gatk jar file", shortName = "J", required = true) + var gatkJarFile: File = _ + + @Input(doc = "xhmm executable file", shortName = "xhmmExec", required = true) + var xhmmExec: File = _ + + @Input(doc = "Plink/Seq executable file", shortName = "pseqExec", required = true) + var pseqExec: File = _ + + @Argument(doc = "Plink/Seq SEQDB file (Reference genome sequence)", shortName = "SEQDB", required = true) + var pseqSeqDB: String = _ + + @Input(shortName = "R", doc = "ref", required = true) + var referenceFile: File = _ + + @Input(shortName = "L", doc = "Intervals", required = false) + var intervals: File = _ + + @Argument(doc = "level of parallelism for BAM DoC. By default is set to 0 [no scattering].", shortName = "scatter", required = false) + var scatterCountInput = 0 + + @Argument(doc = "Samples to run together for DoC. By default is set to 1 [one job per sample].", shortName = "samplesPerJob", required = false) + var samplesPerJob = 1 + + @Output(doc = "Base name for files to output", shortName = "o", required = true) + var outputBase: File = _ + + @Hidden + @Argument(doc = "How should overlapping reads from the same fragment be handled?", shortName = "countType", required = false) + var countType = CoverageUtils.CountPileupType.COUNT_FRAGMENTS + + @Argument(doc = "Maximum depth (before GATK down-sampling kicks in...)", shortName = "MAX_DEPTH", required = false) + var MAX_DEPTH = 20000 + + @Hidden + @Argument(doc = "Number of read-depth bins", shortName = "NUM_BINS", required = false) + var NUM_BINS = 200 + + @Hidden + @Argument(doc = "Starting value of read-depth bins", shortName = "START_BIN", required = false) + var START_BIN = 1 + + @Argument(doc = "Minimum read mapping quality", shortName = "MMQ", required = false) + var minMappingQuality = 0 + + @Argument(doc = "Minimum base quality to be counted in depth", shortName = "MBQ", required = false) + var minBaseQuality = 0 + + @Argument(doc = "Memory (in GB) required for storing the whole matrix in memory", shortName = "wholeMatrixMemory", required = false) + var wholeMatrixMemory = -1 + + @Argument(shortName = "minTargGC", doc = "Exclude all targets with GC content less than this value", required = false) + var minTargGC : Double = 0.1 + + @Argument(shortName = "maxTargGC", doc = "Exclude all targets with GC content greater than this value", required = false) + var maxTargGC : Double = 0.9 + + @Argument(shortName = "minTargRepeats", doc = "Exclude all targets with % of repeat-masked bases less than this value", required = false) + var minTargRepeats : Double = 0.0 + + @Argument(shortName = "maxTargRepeats", doc = "Exclude all targets with % of repeat-masked bases greater than this value", required = false) + var maxTargRepeats : Double = 0.1 + + @Argument(shortName = "sampleIDsMap", doc = "File mapping BAM sample IDs to desired sample IDs", required = false) + var sampleIDsMap: String = "" + + @Argument(shortName = "sampleIDsMapFromColumn", doc = "Column number of OLD sample IDs to map", required = false) + var sampleIDsMapFromColumn = 1 + + @Argument(shortName = "sampleIDsMapToColumn", doc = "Column number of NEW sample IDs to map", required = false) + var sampleIDsMapToColumn = 2 + + @Argument(shortName = "rawFilters", doc = "xhmm command-line parameters to filter targets and samples from raw data", required = false) + var targetSampleFiltersString: String = "" + + @Argument(shortName = "PCAnormalize", doc = "xhmm command-line parameters to Normalize data using PCA information", required = false) + var PCAnormalizeMethodString: String = "" + + @Argument(shortName = "normalizedFilters", doc = "xhmm command-line parameters to filter targets and samples from PCA-normalized data", required = false) + var targetSampleNormalizedFiltersString: String = "" + + @Argument(shortName = "xhmmParams", doc = "xhmm model parameters file", required = true) + var xhmmParamsArg: File = _ + + @Argument(shortName = "discoverParams", doc = "xhmm command-line parameters for discovery step", required = false) + var discoverCommandLineParams: String = "" + + @Argument(shortName = "genotypeParams", doc = "xhmm command-line parameters for genotyping step", required = false) + var genotypeCommandLineParams: String = "" + + @Argument(shortName = "genotypeSubsegments", doc = "Should we also genotype all subsegments of the discovered CNV?", required = false) + var genotypeSubsegments: Boolean = false + + @Argument(shortName = "maxTargetsInSubsegment", doc = "If genotypeSubsegments, then only consider sub-segments consisting of this number of targets or fewer", required = false) + var maxTargetsInSubsegment = 30 + + @Argument(shortName = "subsegmentGenotypeThreshold", doc = "If genotypeSubsegments, this is the default genotype quality threshold for the sub-segments", required = false) + var subsegmentGenotypeThreshold = 20.0 + + @Argument(shortName = "longJobQueue", doc = "Job queue to run the 'long-running' commands", required = false) + var longJobQueue: String = "" + + + val PREPARED_TARGS_SUFFIX: String = ".merged.interval_list" + + val RD_OUTPUT_SUFFIX: String = ".RD.txt" + + val TARGS_GC_SUFFIX = ".locus_GC.txt" + val EXTREME_GC_TARGS_SUFFIX = ".extreme_gc_targets.txt" + + val TARGS_REPEAT_COMPLEXITY_SUFFIX = ".locus_complexity.txt" + val EXTREME_REPEAT_COMPLEXITY_SUFFIX = ".extreme_complexity_targets.txt" + + val FILTERED_TARGS_SUFFIX: String = ".filtered_targets.txt" + val FILTERED_SAMPS_SUFFIX: String = ".filtered_samples.txt" + + + trait WholeMatrixMemoryLimit extends CommandLineFunction { + // Since loading ALL of the data can take significant memory: + if (wholeMatrixMemory < 0) { + this.memoryLimit = 24 + } + else { + this.memoryLimit = wholeMatrixMemory + } + } + + trait LongRunTime extends CommandLineFunction { + if (longJobQueue != "") + this.jobQueue = longJobQueue + } + + def script = { + val prepTargets = new PrepareTargets(List(qscript.intervals), outputBase.getPath + PREPARED_TARGS_SUFFIX, xhmmExec, referenceFile) + add(prepTargets) + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals :+= prepTargets.out + this.jarFile = qscript.gatkJarFile + this.reference_sequence = qscript.referenceFile + this.logging_level = "INFO" + } + + val sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = VCF_BAM_utilities.getMapOfBAMsForSample(VCF_BAM_utilities.parseBAMsInput(bams)) + val samples: List[String] = sampleToBams.keys.toList + Console.out.printf("Samples are %s%n", samples) + + val groups: List[Group] = buildDoCgroups(samples, sampleToBams, samplesPerJob, outputBase) + var docs: List[DoC] = List[DoC]() + for (group <- groups) { + Console.out.printf("Group is %s%n", group) + docs ::= new DoC(group.bams, group.DoC_output, countType, MAX_DEPTH, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, Nil) with CommandLineGATKArgs + } + addAll(docs) + + val mergeDepths = new MergeGATKdepths(docs.map(u => u.intervalSampleOut), outputBase.getPath + RD_OUTPUT_SUFFIX, "_mean_cvg", xhmmExec, sampleIDsMap, sampleIDsMapFromColumn, sampleIDsMapToColumn, None, false) with WholeMatrixMemoryLimit + add(mergeDepths) + + var excludeTargets : List[File] = List[File]() + if (minTargGC > 0 || maxTargGC < 1) { + val calcGCcontents = new GCContentByInterval with CommandLineGATKArgs + calcGCcontents.out = outputBase.getPath + TARGS_GC_SUFFIX + add(calcGCcontents) + + val excludeTargetsBasedOnGC = new ExcludeTargetsBasedOnValue(calcGCcontents.out, EXTREME_GC_TARGS_SUFFIX, minTargGC, maxTargGC) + add(excludeTargetsBasedOnGC) + excludeTargets ::= excludeTargetsBasedOnGC.out + } + + class CalculateRepeatComplexity(outFile : String) extends CommandLineFunction { + @Input(doc="") + var intervals: File = prepTargets.out + + @Output(doc="") + var out : File = new File(outFile) + + val regFile : String = outputBase.getPath + ".targets.reg" + val locDB : String = outputBase.getPath + ".targets.LOCDB" + + val removeFiles = "rm -f " + regFile + " " + locDB + val createRegFile = "cat " + intervals + " | awk 'BEGIN{OFS=\"\\t\"; print \"#CHR\\tBP1\\tBP2\\tID\"} {split($1,a,\":\"); chr=a[1]; if (match(chr,\"chr\")==0) {chr=\"chr\"chr} split(a[2],b,\"-\"); bp1=b[1]; bp2=bp1; if (length(b) > 1) {bp2=b[2]} print chr,bp1,bp2,NR}' > " + regFile + val createLOCDB = pseqExec + " . loc-load --locdb " + locDB + " --file " + regFile + " --group targets --out " + locDB + ".loc-load" + val calcRepeatMaskedPercent = pseqExec + " . loc-stats --locdb " + locDB + " --group targets --seqdb " + pseqSeqDB + " --out " + locDB + ".loc-stats" + val extractRepeatMaskedPercent = "cat " + locDB + ".loc-stats.locstats | awk '{if (NR > 1) print $_}' | sort -k1 -g | awk '{print $10}' | paste " + intervals + " - | awk '{print $1\"\\t\"$2}' > " + out + + var command: String = + removeFiles + + " && " + createRegFile + + " && " + createLOCDB + + " && " + calcRepeatMaskedPercent + + " && " + extractRepeatMaskedPercent + + def commandLine = command + + override def description = "Calculate the percentage of each target that is repeat-masked in the reference sequence: " + command + } + + if (minTargRepeats > 0 || maxTargRepeats < 1) { + val calcRepeatComplexity = new CalculateRepeatComplexity(outputBase.getPath + TARGS_REPEAT_COMPLEXITY_SUFFIX) + add(calcRepeatComplexity) + + val excludeTargetsBasedOnRepeats = new ExcludeTargetsBasedOnValue(calcRepeatComplexity.out, EXTREME_REPEAT_COMPLEXITY_SUFFIX, minTargRepeats, maxTargRepeats) + add(excludeTargetsBasedOnRepeats) + excludeTargets ::= excludeTargetsBasedOnRepeats.out + } + + val filterCenterDepths = new FilterCenterRawMatrix(mergeDepths.mergedDoC, excludeTargets) + add(filterCenterDepths) + + val pca = new PCA(filterCenterDepths.filteredCentered) + add(pca) + + val normalize = new Normalize(pca) + add(normalize) + + val filterZscore = new FilterAndZscoreNormalized(normalize.normalized) + add(filterZscore) + + val filterOriginal = new FilterOriginalData(mergeDepths.mergedDoC, filterCenterDepths, filterZscore) + add(filterOriginal) + + val discover = new DiscoverCNVs(filterZscore.filteredZscored, filterOriginal.sameFiltered) + add(discover) + + val genotype = new GenotypeCNVs(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotype) + + if (genotypeSubsegments) { + val genotypeSegs = new GenotypeCNVandSubsegments(filterZscore.filteredZscored, discover.xcnv, filterOriginal.sameFiltered) + add(genotypeSegs) + } + } + + class ExcludeTargetsBasedOnValue(locus_valueIn : File, outSuffix : String, minVal : Double, maxVal : Double) extends InProcessFunction { + @Input(doc="") + var locus_value : File = locus_valueIn + + @Output(doc="") + var out : File = new File(outputBase.getPath + outSuffix) + + def run = { + var outWriter = new PrintWriter(new PrintStream(out)) + var elems = asScalaIterator(new XReadLines(locus_value)) + + while (elems.hasNext) { + val line = elems.next + val splitLine = line.split("\\s+") + val locus = splitLine(0) + val locValStr = splitLine(1) + try { + val locVal = locValStr.toDouble + if (locVal < minVal || locVal > maxVal) + outWriter.printf("%s%n", locus) + } + catch { + case nfe: NumberFormatException => println("Ignoring non-numeric value " + locValStr + " for locus " + locus) + case e: Exception => throw e + } + } + + outWriter.close + } + } + + class FilterCenterRawMatrix(inputParam: File, excludeTargetsIn : List[File]) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val excludeTargets = excludeTargetsIn + + @Output + val filteredCentered: File = new File(outputBase.getPath + ".filtered_centered" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredCentered.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredCentered.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType target" + + " -o " + filteredCentered + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + command += excludeTargets.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + if (targetSampleFiltersString != "") + command += " " + targetSampleFiltersString + + def commandLine = command + + override def description = "Filters samples and targets and then mean-centers the targets: " + command + } + + class PCA(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + val PCAbase: String = outputBase.getPath + ".RD_PCA" + + @Output + val outPC: File = new File(PCAbase + ".PC.txt") + @Output + val outPC_SD: File = new File(PCAbase + ".PC_SD.txt") + @Output + val outPC_LOADINGS: File = new File(PCAbase + ".PC_LOADINGS.txt") + + var command: String = + xhmmExec + " --PCA" + + " -r " + input + + " --PCAfiles " + PCAbase + + def commandLine = command + + override def description = "Runs PCA on mean-centered data: " + command + } + + class Normalize(pca: PCA) extends CommandLineFunction { + @Input(doc = "") + val input = pca.input + + @Input(doc = "") + val inPC = pca.outPC + + @Input(doc = "") + val inPC_SD = pca.outPC_SD + + @Input(doc = "") + val inPC_LOADINGS = pca.outPC_LOADINGS + + @Output + val normalized: File = new File(outputBase.getPath + ".PCA_normalized.txt") + + var command: String = + xhmmExec + " --normalize" + + " -r " + input + + " --PCAfiles " + pca.PCAbase + + " --normalizeOutput " + normalized + if (PCAnormalizeMethodString != "") + command += " " + PCAnormalizeMethodString + + def commandLine = command + + override def description = "Normalizes mean-centered data using PCA information: " + command + } + + class FilterAndZscoreNormalized(inputParam: File) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Output + val filteredZscored: File = new File(outputBase.getPath + ".PCA_normalized.filtered.sample_zscores" + RD_OUTPUT_SUFFIX) + @Output + val filteredTargets: File = new File(filteredZscored.getPath + FILTERED_TARGS_SUFFIX) + @Output + val filteredSamples: File = new File(filteredZscored.getPath + FILTERED_SAMPS_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + " --centerData --centerType sample --zScoreData" + + " -o " + filteredZscored + + " --outputExcludedTargets " + filteredTargets + + " --outputExcludedSamples " + filteredSamples + if (targetSampleNormalizedFiltersString != "") + command += " " + targetSampleNormalizedFiltersString + + def commandLine = command + + override def description = "Filters and z-score centers (by sample) the PCA-normalized data: " + command + } + + class FilterOriginalData(inputParam: File, filt1: FilterCenterRawMatrix, filt2: FilterAndZscoreNormalized) extends CommandLineFunction with WholeMatrixMemoryLimit { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val targFilters: List[File] = List(filt1.filteredTargets, filt2.filteredTargets).map(u => new File(u)) + + @Input(doc = "") + val sampFilters: List[File] = List(filt1.filteredSamples, filt2.filteredSamples).map(u => new File(u)) + + @Output + val sameFiltered: File = new File(outputBase.getPath + ".same_filtered" + RD_OUTPUT_SUFFIX) + + var command: String = + xhmmExec + " --matrix" + + " -r " + input + + targFilters.map(u => " --excludeTargets " + u).reduceLeft(_ + "" + _) + + sampFilters.map(u => " --excludeSamples " + u).reduceLeft(_ + "" + _) + + " -o " + sameFiltered + + def commandLine = command + + override def description = "Filters original read-depth data to be the same as filtered, normalized data: " + command + } + + class DiscoverCNVs(inputParam: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Output + val xcnv: File = new File(outputBase.getPath + ".xcnv") + + @Output + val aux_xcnv: File = new File(outputBase.getPath + ".aux_xcnv") + + val posteriorsBase = outputBase.getPath + + @Output + val dipPosteriors: File = new File(posteriorsBase + ".posteriors.DIP.txt") + + @Output + val delPosteriors: File = new File(posteriorsBase + ".posteriors.DEL.txt") + + @Output + val dupPosteriors: File = new File(posteriorsBase + ".posteriors.DUP.txt") + + var command: String = + xhmmExec + " --discover" + + " -p " + xhmmParams + + " -r " + input + + " -R " + origRD + + " -c " + xcnv + + " -a " + aux_xcnv + + " -s " + posteriorsBase + + " " + discoverCommandLineParams + + def commandLine = command + + override def description = "Discovers CNVs in normalized data: " + command + } + + abstract class BaseGenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends CommandLineFunction with LongRunTime { + @Input(doc = "") + val input = inputParam + + @Input(doc = "") + val xhmmParams = xhmmParamsArg + + @Input(doc = "") + val origRD = origRDParam + + @Input(doc = "") + val inXcnv = xcnv + + var command: String = + xhmmExec + " --genotype" + + " -p " + xhmmParams + + " -r " + input + + " -g " + inXcnv + + " -F " + referenceFile + + " -R " + origRD + + " " + genotypeCommandLineParams + } + + class GenotypeCNVs(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".vcf") + + command += + " -v " + vcf + + def commandLine = command + + override def description = "Genotypes discovered CNVs in all samples: " + command + } + + class GenotypeCNVandSubsegments(inputParam: File, xcnv: File, origRDParam: File) extends BaseGenotypeCNVs(inputParam, xcnv, origRDParam) { + @Output + val vcf: File = new File(outputBase.getPath + ".subsegments.vcf") + + command += + " -v " + vcf + + " --subsegments" + + " --maxTargetsInSubsegment " + maxTargetsInSubsegment + + " --genotypeQualThresholdWhenNoExact " + subsegmentGenotypeThreshold + + def commandLine = command + + override def description = "Genotypes discovered CNVs (and their sub-segments, of up to " + maxTargetsInSubsegment + " targets) in all samples: " + command + } +} diff --git a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala index 24ab50451..dc6cae197 100755 --- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala +++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/GATKResourcesBundle.scala @@ -147,13 +147,13 @@ class GATKResourcesBundle extends QScript { // // example call set for wiki tutorial // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.hg19.subset.vcf", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/exampleCalls/NA12878.HiSeq.WGS.bwa.cleaned.raw.b37.subset.vcf", "NA12878.HiSeq.WGS.bwa.cleaned.raw.subset", b37, true, true)) // // Test BAM file, specific to each reference // - addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam", + addResource(new Resource("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.b37.20.bam", "IGNORE", b37, false, false)) // diff --git a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala index 637174557..f899af86d 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QCommandLine.scala @@ -110,95 +110,103 @@ class QCommandLine extends CommandLineProgram with Logging { * functions, and then builds and runs a QGraph based on the dependencies. */ def execute = { - ClassFieldCache.parsingEngine = this.parser + var success = false + var result = 1 + try { + ClassFieldCache.parsingEngine = this.parser - if (settings.qSettings.runName == null) - settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) - if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) - settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") - qGraph.initializeWithSettings(settings) + if (settings.qSettings.runName == null) + settings.qSettings.runName = FilenameUtils.removeExtension(scripts.head.getName) + if (IOUtils.isDefaultTempDir(settings.qSettings.tempDirectory)) + settings.qSettings.tempDirectory = IOUtils.absolute(settings.qSettings.runDirectory, ".queue/tmp") + qGraph.initializeWithSettings(settings) - for (commandPlugin <- allCommandPlugins) { - loadArgumentsIntoObject(commandPlugin) - } - - for (commandPlugin <- allCommandPlugins) { - if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.started() - } - - qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq - - // TODO: Default command plugin argument? - val remoteFileConverter = ( - for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) - yield commandPlugin.remoteFileConverter - ).headOption.getOrElse(null) - - if (remoteFileConverter != null) - loadArgumentsIntoObject(remoteFileConverter) - - val allQScripts = qScriptPluginManager.createAllTypes() - for (script <- allQScripts) { - logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) - loadArgumentsIntoObject(script) - allCommandPlugins.foreach(_.initScript(script)) - // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. - //if (settings.run) - script.pullInputs() - script.qSettings = settings.qSettings - try { - script.script() - } catch { - case e: Exception => - throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) + for (commandPlugin <- allCommandPlugins) { + loadArgumentsIntoObject(commandPlugin) } - if (remoteFileConverter != null) { - if (remoteFileConverter.convertToRemoteEnabled) - script.mkRemoteOutputs(remoteFileConverter) - } - - script.functions.foreach(qGraph.add(_)) - logger.info("Added " + script.functions.size + " functions") - } - // Execute the job graph - qGraph.run() - - val functionsAndStatus = qGraph.getFunctionsAndStatus - val success = qGraph.success - - // walk over each script, calling onExecutionDone - for (script <- allQScripts) { - val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f)) - script.onExecutionDone(scriptFunctions, success) - } - - logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) - - // write the final complete job report - logger.info("Writing final jobs report...") - qGraph.writeJobsReport() - - if (!success) { - logger.info("Done with errors") - qGraph.logFailed() - for (commandPlugin <- allCommandPlugins) + for (commandPlugin <- allCommandPlugins) { if (commandPlugin.statusMessenger != null) - commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts)) - 1 - } else { - if (settings.run) { - allQScripts.foreach(_.pushOutputs()) - for (commandPlugin <- allCommandPlugins) - if (commandPlugin.statusMessenger != null) { - val allInputs = allQScripts.map(_.remoteInputs) - val allOutputs = allQScripts.map(_.remoteOutputs) - commandPlugin.statusMessenger.done(allInputs, allOutputs) - } + commandPlugin.statusMessenger.started() + } + + qGraph.messengers = allCommandPlugins.filter(_.statusMessenger != null).map(_.statusMessenger).toSeq + + // TODO: Default command plugin argument? + val remoteFileConverter = ( + for (commandPlugin <- allCommandPlugins if (commandPlugin.remoteFileConverter != null)) + yield commandPlugin.remoteFileConverter + ).headOption.getOrElse(null) + + if (remoteFileConverter != null) + loadArgumentsIntoObject(remoteFileConverter) + + val allQScripts = qScriptPluginManager.createAllTypes() + for (script <- allQScripts) { + logger.info("Scripting " + qScriptPluginManager.getName(script.getClass.asSubclass(classOf[QScript]))) + loadArgumentsIntoObject(script) + allCommandPlugins.foreach(_.initScript(script)) + // TODO: Pulling inputs can be time/io expensive! Some scripts are using the files to generate functions-- even for dry runs-- so pull it all down for now. + //if (settings.run) + script.pullInputs() + script.qSettings = settings.qSettings + try { + script.script() + } catch { + case e: Exception => + throw new UserException.CannotExecuteQScript(script.getClass.getSimpleName + ".script() threw the following exception: " + e, e) + } + + if (remoteFileConverter != null) { + if (remoteFileConverter.convertToRemoteEnabled) + script.mkRemoteOutputs(remoteFileConverter) + } + + script.functions.foreach(qGraph.add(_)) + logger.info("Added " + script.functions.size + " functions") + } + // Execute the job graph + qGraph.run() + + val functionsAndStatus = qGraph.getFunctionsAndStatus + + // walk over each script, calling onExecutionDone + for (script <- allQScripts) { + val scriptFunctions = functionsAndStatus.filterKeys(f => script.functions.contains(f)) + script.onExecutionDone(scriptFunctions, success) + } + + logger.info("Script %s with %d total jobs".format(if (success) "completed successfully" else "failed", functionsAndStatus.size)) + + // write the final complete job report + logger.info("Writing final jobs report...") + qGraph.writeJobsReport() + + if (qGraph.success) { + if (settings.run) { + allQScripts.foreach(_.pushOutputs()) + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) { + val allInputs = allQScripts.map(_.remoteInputs) + val allOutputs = allQScripts.map(_.remoteOutputs) + commandPlugin.statusMessenger.done(allInputs, allOutputs) + } + } + success = true + result = 0 + } + } finally { + if (!success) { + logger.info("Done with errors") + qGraph.logFailed() + if (settings.run) { + for (commandPlugin <- allCommandPlugins) + if (commandPlugin.statusMessenger != null) + commandPlugin.statusMessenger.exit("Done with errors: %s".format(qGraph.formattedStatusCounts)) + } } - 0 } + result } /** diff --git a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala index eb8be183a..5b67ae913 100755 --- a/public/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -124,49 +124,26 @@ trait QScript extends Logging with PrimitiveOptionConversions with StringFileCon } /** - * Pull all remote files to the local disk. + * Pull all remote files to the local disk */ def pullInputs() { - val inputs = ClassFieldCache.getFieldFiles(this, inputFields) - for (remoteFile <- filterRemoteFiles(inputs)) { - logger.info("Pulling %s from %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pullToLocal() - } } /** - * Push all remote files from the local disk. + * Push all remote files from the local disk */ def pushOutputs() { - val outputs = ClassFieldCache.getFieldFiles(this, outputFields) - for (remoteFile <- filterRemoteFiles(outputs)) { - logger.info("Pushing %s to %s".format(remoteFile.getAbsolutePath, remoteFile.remoteDescription)) - remoteFile.pushToRemote() - } } /** - * List out the remote outputs - * @return the RemoteFile outputs by argument source + * @return the inputs or null if there are no inputs */ - def remoteInputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(inputFields)) + def remoteInputs: AnyRef = null /** - * List out the remote outputs - * @return the RemoteFile outputs by argument source + * @return the outputs or null if there are no outputs */ - def remoteOutputs: Map[String, Seq[RemoteFile]] = tagMap(remoteFieldMap(outputFields)) - - private def tagMap(remoteFieldMap: Map[ArgumentSource, Seq[RemoteFile]]): Map[String, Seq[RemoteFile]] = { - remoteFieldMap.collect{ case (k, v) => ClassFieldCache.fullName(k) -> v }.toMap - } - - private def remoteFieldMap(fields: Seq[ArgumentSource]): Map[ArgumentSource, Seq[RemoteFile]] = { - fields.map(field => (field -> filterRemoteFiles(ClassFieldCache.getFieldFiles(this, field)))).filter(tuple => !tuple._2.isEmpty).toMap - } - - private def filterRemoteFiles(fields: Seq[File]): Seq[RemoteFile] = - fields.filter(field => field != null && field.isInstanceOf[RemoteFile]).map(_.asInstanceOf[RemoteFile]) + def remoteOutputs: AnyRef = null /** The complete list of fields. */ def functionFields: Seq[ArgumentSource] = ClassFieldCache.classFunctionFields(this.getClass) diff --git a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala index a1133b944..a69f68b8e 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/engine/QStatusMessenger.scala @@ -7,7 +7,7 @@ import org.broadinstitute.sting.queue.util.RemoteFile */ trait QStatusMessenger { def started() - def done(inputs: Seq[Map[String, Seq[RemoteFile]]], outputs: Seq[Map[String, Seq[RemoteFile]]]) + def done(inputs: Seq[_], outputs: Seq[_]) def exit(message: String) def started(job: String) diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala new file mode 100644 index 000000000..2b19b0f8e --- /dev/null +++ b/public/scala/src/org/broadinstitute/sting/queue/util/DoC/package.scala @@ -0,0 +1,131 @@ +package org.broadinstitute.sting.queue.util + +import java.io.File +import org.broadinstitute.sting.queue.extensions.gatk.{IntervalScatterFunction, CommandLineGATK} +import org.broadinstitute.sting.queue.function.scattergather.ScatterGatherableFunction +import org.broadinstitute.sting.gatk.downsampling.DownsampleType +import org.broadinstitute.sting.commandline.{Input, Gather, Output} +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.gatk.walkers.coverage.CoverageUtils + +package object DoC { + class DoC(val bams: List[File], val DoC_output: File, val countType: CoverageUtils.CountPileupType, val MAX_DEPTH: Int, val minMappingQuality: Int, val minBaseQuality: Int, val scatterCountInput: Int, val START_BIN: Int, val NUM_BINS: Int, val minCoverageCalcs: Seq[Int]) extends CommandLineGATK with ScatterGatherableFunction { + val DOC_OUTPUT_SUFFIX: String = ".sample_interval_summary" + + // So that the output files of this DoC run get deleted once they're used further downstream: + this.isIntermediate = true + + this.analysis_type = "DepthOfCoverage" + + this.input_file = bams + + this.downsample_to_coverage = Some(MAX_DEPTH) + this.downsampling_type = DownsampleType.BY_SAMPLE + + this.scatterCount = scatterCountInput + this.scatterClass = classOf[IntervalScatterFunction] + + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var intervalSampleOut: File = new File(DoC_output.getPath + DOC_OUTPUT_SUFFIX) + + override def commandLine = super.commandLine + + " --omitDepthOutputAtEachBase" + + " --omitLocusTable" + + " --minMappingQuality " + minMappingQuality + + " --minBaseQuality " + minBaseQuality + + optional("--countType", countType, spaceSeparated=true, escape=true, format="%s") + + " --start " + START_BIN + " --stop " + MAX_DEPTH + " --nBins " + NUM_BINS + + (if (!minCoverageCalcs.isEmpty) minCoverageCalcs.map(cov => " --summaryCoverageThreshold " + cov).reduceLeft(_ + "" + _) else "") + + " --includeRefNSites" + + " -o " + DoC_output + + override def shortDescription = "DoC: " + DoC_output + } + + class DoCwithDepthOutputAtEachBase(bams: List[File], DoC_output: File, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality: Int, minBaseQuality: Int, scatterCountInput: Int, START_BIN: Int, NUM_BINS: Int, minCoverageCalcs: Seq[Int]) extends DoC(bams, DoC_output, countType: CoverageUtils.CountPileupType, MAX_DEPTH: Int, minMappingQuality, minBaseQuality, scatterCountInput, START_BIN, NUM_BINS, minCoverageCalcs) { + // HACK for DoC to work properly within Queue: + @Output + @Gather(classOf[org.broadinstitute.sting.queue.function.scattergather.SimpleTextGatherFunction]) + var outPrefix = DoC_output + + override def commandLine = super.commandLine.replaceAll(" --omitDepthOutputAtEachBase", "") + } + + def buildDoCgroups(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]], samplesPerJob: Int, outputBase: File): List[Group] = { + var l: List[Group] = Nil + + var remaining = samples + var subsamples: List[String] = Nil + var count = 1 + + while (!remaining.isEmpty) { + val splitRes = (remaining splitAt samplesPerJob) + subsamples = splitRes._1 + remaining = splitRes._2 + l ::= new Group("group" + count, outputBase, subsamples, VCF_BAM_utilities.findBAMsForSamples(subsamples, sampleToBams)) + count = count + 1 + } + + return l + } + + // A group has a list of samples and bam files to use for DoC + class Group(val name: String, val outputBase: File, val samples: List[String], val bams: List[File]) { + // getName() just includes the file name WITHOUT the path: + val groupOutputName = name + "." + outputBase.getName + + // Comment this out, so that when jobs are scattered in DoC class below, they do not scatter into outputs at directories that don't exist!!! : + //def DoC_output = new File(outputBase.getParentFile(), groupOutputName) + + def DoC_output = new File(groupOutputName) + + override def toString(): String = String.format("[Group %s [%s] with samples %s against bams %s]", name, DoC_output, samples, bams) + } + + class MergeGATKdepths(DoCsToCombine: List[File], outFile: String, columnSuffix: String, xhmmExec: File, sampleIDsMap: String, sampleIDsMapFromColumn: Int, sampleIDsMapToColumn: Int, rdPrecisionArg: Option[Int], outputTargetsBySamples: Boolean) extends CommandLineFunction { + @Input(doc = "") + var inputDoCfiles: List[File] = DoCsToCombine + + @Output + val mergedDoC: File = new File(outFile) + var command: String = + xhmmExec + " --mergeGATKdepths" + + inputDoCfiles.map(input => " --GATKdepths " + input).reduceLeft(_ + "" + _) + + " --columnSuffix " + columnSuffix + + " -o " + mergedDoC + if (sampleIDsMap != "") + command += " --sampleIDmap " + sampleIDsMap + " --fromID " + sampleIDsMapFromColumn + " --toID " + sampleIDsMapToColumn + rdPrecisionArg match { + case Some(rdPrecision) => { + command += " --rdPrecision " + rdPrecision + } + case None => {} + } + if (outputTargetsBySamples) + command += " --outputTargetsBySamples" + + def commandLine = command + + override def description = "Combines DoC outputs for multiple samples (at same loci): " + command + } + + class PrepareTargets(intervalsIn: List[File], outIntervals: String, val xhmmExec: File, val referenceFile: File) extends CommandLineFunction { + @Input(doc = "List of files containing targeted intervals to be prepared and merged") + var inIntervals: List[File] = intervalsIn + + @Output(doc = "The merged intervals file to write to") + var out: File = new File(outIntervals) + + var command: String = + xhmmExec + " --prepareTargets" + + " -F " + referenceFile + + inIntervals.map(intervFile => " --targets " + intervFile).reduceLeft(_ + "" + _) + + " --mergedTargets " + out + + def commandLine = command + + override def description = "Sort all target intervals, merge overlapping ones, and print the resulting interval list: " + command + } +} diff --git a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala index 1f18858e1..3fe867981 100644 --- a/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala +++ b/public/scala/src/org/broadinstitute/sting/queue/util/VCF_BAM_utilities.scala @@ -26,36 +26,31 @@ object VCF_BAM_utilities { case _ => throw new RuntimeException("Unexpected BAM input type: " + bamsIn + "; only permitted extensions are .bam and .list") } - def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = bams match { - case Nil => return scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] - - case x :: y => - val m: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = getMapOfBAMsForSample(y) - val bamSamples: List[String] = getSamplesInBAM(x) + def getMapOfBAMsForSample(bams: List[File]): scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]] = { + var m = scala.collection.mutable.Map.empty[String, scala.collection.mutable.Set[File]] + for (bam <- bams) { + val bamSamples: List[String] = getSamplesInBAM(bam) for (s <- bamSamples) { if (!m.contains(s)) m += s -> scala.collection.mutable.Set.empty[File] - m(s) = m(s) + x + m(s) += bam } + } return m } def findBAMsForSamples(samples: List[String], sampleToBams: scala.collection.mutable.Map[String, scala.collection.mutable.Set[File]]): List[File] = { + var s = scala.collection.mutable.Set.empty[File] - def findBAMsForSamplesHelper(samples: List[String]): scala.collection.mutable.Set[File] = samples match { - case Nil => scala.collection.mutable.Set.empty[File] - - case x :: y => - var bamsForSampleX: scala.collection.mutable.Set[File] = scala.collection.mutable.Set.empty[File] - if (sampleToBams.contains(x)) - bamsForSampleX = sampleToBams(x) - return bamsForSampleX ++ findBAMsForSamplesHelper(y) + for (sample <- samples) { + if (sampleToBams.contains(sample)) + s ++= sampleToBams(sample) } val l: List[File] = Nil - return l ++ findBAMsForSamplesHelper(samples) + return l ++ s } } diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala index 944ef7977..60c9d9a59 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/DataProcessingPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.BaseTest class DataProcessingPipelineTest { - @Test + @Test(timeOut=36000000) def testSimpleBAM { val projectName = "test1" val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" @@ -45,7 +45,7 @@ class DataProcessingPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testBWAPEBAM { val projectName = "test2" val testOut = projectName + ".exampleBAM.bam.clean.dedup.recal.bam" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala index 3e9af3e68..dd07cbfdc 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/PacbioProcessingPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.BaseTest class PacbioProcessingPipelineTest { - @Test + @Test(timeOut=36000000) def testPacbioProcessingPipeline { val testOut = "exampleBAM.recal.bam" val spec = new PipelineTestSpec diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala index 92c40acb1..6bc6b56db 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/DevNullOutputPipelineTest.scala @@ -53,7 +53,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class DevNullOutputPipelineTest { - @Test + @Test(timeOut=36000000) def testDevNullOutput() { val spec = new PipelineTestSpec spec.name = "devnulloutput" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala index 9d885dda2..f52632a7f 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountLociPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleCountLociPipelineTest { - @Test + @Test(timeOut=36000000) def testCountLoci() { val testOut = "count.out" val spec = new PipelineTestSpec diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala index 1b965d8d2..c23c12719 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleCountReadsPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleCountReadsPipelineTest { - @Test + @Test(timeOut=36000000) def testCountReads() { val spec = new PipelineTestSpec spec.name = "countreads" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala index c6e4c3507..4ffaf7b5c 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleReadFilterPipelineTest.scala @@ -77,7 +77,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleReadFilterPipelineTest { - @Test + @Test(timeOut=36000000) def testExampleReadFilter() { val spec = new PipelineTestSpec spec.name = "examplereadfilter" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala index a9a5928fc..0215a389c 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleRetryMemoryLimitPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleRetryMemoryLimitPipelineTest { - @Test + @Test(timeOut=36000000) def testRetryMemoryLimit() { val spec = new PipelineTestSpec spec.name = "RetryMemoryLimit" diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala index f6fcd7c12..67ac68c28 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/ExampleUnifiedGenotyperPipelineTest.scala @@ -29,7 +29,7 @@ import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} import org.broadinstitute.sting.BaseTest class ExampleUnifiedGenotyperPipelineTest { - @Test + @Test(timeOut=36000000) def testUnifiedGenotyper() { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper" @@ -51,7 +51,7 @@ class ExampleUnifiedGenotyperPipelineTest { Array("vcf_intervals", BaseTest.validationDataLocation + "intervalTest.1.vcf") ).asInstanceOf[Array[Array[Object]]] - @Test(dataProvider = "ugIntervals") + @Test(dataProvider = "ugIntervals", timeOut=36000000) def testUnifiedGenotyperWithIntervals(intervalsName: String, intervalsPath: String) { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_with_" + intervalsName @@ -64,7 +64,7 @@ class ExampleUnifiedGenotyperPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testUnifiedGenotyperNoGCOpt() { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_no_gc_opt" @@ -80,7 +80,7 @@ class ExampleUnifiedGenotyperPipelineTest { @DataProvider(name="resMemReqParams") def getResMemReqParam = Array(Array("mem_free"), Array("virtual_free")).asInstanceOf[Array[Array[Object]]] - @Test(dataProvider = "resMemReqParams") + @Test(dataProvider = "resMemReqParams", timeOut=36000000) def testUnifiedGenotyperResMemReqParam(reqParam: String) { val spec = new PipelineTestSpec spec.name = "unifiedgenotyper_" + reqParam diff --git a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala index a43727ba6..50fc529dd 100644 --- a/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala +++ b/public/scala/test/org/broadinstitute/sting/queue/pipeline/examples/HelloWorldPipelineTest.scala @@ -28,7 +28,7 @@ import org.testng.annotations.Test import org.broadinstitute.sting.queue.pipeline.{PipelineTest, PipelineTestSpec} class HelloWorldPipelineTest { - @Test + @Test(timeOut=36000000) def testHelloWorld() { val spec = new PipelineTestSpec spec.name = "HelloWorld" @@ -37,7 +37,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithRunName() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithRunName" @@ -47,7 +47,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldMemoryLimit" @@ -57,7 +57,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithPriority() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithPriority" @@ -67,7 +67,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfResource() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfResource" @@ -77,7 +77,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfResourceAndMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfResourceAndMemoryLimit" @@ -87,7 +87,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithLsfEnvironment() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithLsfEnvironment" @@ -97,7 +97,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineResource() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineResource" @@ -107,7 +107,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineResourceAndMemoryLimit() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineResourceAndMemoryLimit" @@ -117,7 +117,7 @@ class HelloWorldPipelineTest { PipelineTest.executeTest(spec) } - @Test + @Test(timeOut=36000000) def testHelloWorldWithGridEngineEnvironment() { val spec = new PipelineTestSpec spec.name = "HelloWorldWithGridEngineEnvironment" diff --git a/settings/repository/org.broad/tribble-110.jar b/settings/repository/org.broad/tribble-119.jar similarity index 78% rename from settings/repository/org.broad/tribble-110.jar rename to settings/repository/org.broad/tribble-119.jar index f8e312ad9..c74bea398 100644 Binary files a/settings/repository/org.broad/tribble-110.jar and b/settings/repository/org.broad/tribble-119.jar differ diff --git a/settings/repository/org.broad/tribble-110.xml b/settings/repository/org.broad/tribble-119.xml similarity index 79% rename from settings/repository/org.broad/tribble-110.xml rename to settings/repository/org.broad/tribble-119.xml index 84a550b27..08037b20e 100644 --- a/settings/repository/org.broad/tribble-110.xml +++ b/settings/repository/org.broad/tribble-119.xml @@ -1,3 +1,3 @@ - +