diff --git a/build.xml b/build.xml index a93918ec8..834aef3cd 100644 --- a/build.xml +++ b/build.xml @@ -185,10 +185,7 @@ - - - - + @@ -205,6 +202,16 @@ + + + + + + + + + + @@ -226,20 +233,20 @@ - - - - + + + + + + + + + + + - - - - - - - @@ -672,6 +679,24 @@ + + + + + + + + + + + + + + + + + + @@ -1103,15 +1128,10 @@ - - + - - - - @@ -1119,9 +1139,6 @@ - - - @@ -1129,10 +1146,8 @@ - - + - @@ -1143,11 +1158,9 @@ - + - - @@ -1160,9 +1173,8 @@ - + - @@ -1369,14 +1381,13 @@ - + - @@ -1394,13 +1405,27 @@ - - + + + + + + + + + + + + + + + + diff --git a/ivy.xml b/ivy.xml index 1d2f95dc1..b7ca65406 100644 --- a/ivy.xml +++ b/ivy.xml @@ -24,11 +24,8 @@ - + - - - @@ -83,9 +80,9 @@ - - - + + + diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java index 3cdf3d75e..629a27c48 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/ReduceReads.java @@ -253,7 +253,6 @@ public class ReduceReads extends ReadWalker, ReduceRea intervalList.addAll(toolkit.getIntervals()); - // todo -- rework the whole NO_PG_TAG thing final boolean preSorted = true; final boolean indexOnTheFly = true; final boolean keep_records = true; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java index 24a3ba3cb..fff1c20a5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/compression/reducereads/SlidingWindow.java @@ -220,7 +220,6 @@ public class SlidingWindow { regions = findVariantRegions(0, breakpoint, markedSites.getVariantSiteBitSet(), !forceClose); } - // todo -- can be more aggressive here removing until the NEW window header start location after closing the variant regions while (!readsInWindow.isEmpty() && readsInWindow.first().getSoftEnd() < windowHeaderStartLocation) { readsInWindow.pollFirst(); } @@ -607,9 +606,7 @@ public class SlidingWindow { toRemove.add(read); } } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); } return allReads; } @@ -805,9 +802,8 @@ public class SlidingWindow { hetReads.add(finalizeRunningConsensus()); } - for (GATKSAMRecord read : toRemove) { - readsInWindow.remove(read); - } + removeReadsFromWindow(toRemove); + return hetReads; } @@ -924,5 +920,11 @@ public class SlidingWindow { } } } + + private void removeReadsFromWindow (List readsToRemove) { + for (GATKSAMRecord read : readsToRemove) { + readsInWindow.remove(read); + } + } } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d91df82e2..4fc2dc8f7 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -31,7 +31,6 @@ import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; -import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; @@ -42,163 +41,26 @@ import java.util.*; public class GenotypingEngine { private final boolean DEBUG; - private final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE; private final static List noCall = new ArrayList(); // used to noCall all genotypes until the exact model is applied private final static Allele SYMBOLIC_UNASSEMBLED_EVENT_ALLELE = Allele.create("", false); - public GenotypingEngine( final boolean DEBUG, final boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { + public GenotypingEngine( final boolean DEBUG ) { this.DEBUG = DEBUG; - this.OUTPUT_FULL_HAPLOTYPE_SEQUENCE = OUTPUT_FULL_HAPLOTYPE_SEQUENCE; noCall.add(Allele.NO_CALL); } - // WARN - // This function is the streamlined approach, currently not being used by default - // WARN - // WARN: This function is currently only being used by Menachem. Slated for removal/merging with the rest of the code. - // WARN - @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallHaplotypeEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser ) { - // Prepare the list of haplotype indices to genotype - final ArrayList allelesToGenotype = new ArrayList(); - - for( final Haplotype h : haplotypes ) { - allelesToGenotype.add( Allele.create(h.getBases(), h.isReference()) ); - } - final int numHaplotypes = haplotypes.size(); - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext genotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, sample); - int glIndex = 0; - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - genotypes.add(new GenotypeBuilder(sample, noCall).PL(genotypeLikelihoods).make()); - } - final VariantCallContext call = UG_engine.calculateGenotypes(new VariantContextBuilder().loc(activeRegionWindow).alleles(allelesToGenotype).genotypes(genotypes).make(), UG_engine.getUAC().GLmodel); - if( call == null ) { return Collections.emptyList(); } // exact model says that the call confidence is below the specified confidence threshold so nothing to do here - - // Prepare the list of haplotypes that need to be run through Smith-Waterman for output to VCF - final ArrayList haplotypesToRemove = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( call.getAllele(h.getBases()) == null ) { // exact model removed this allele from the list so no need to run SW and output to VCF - haplotypesToRemove.add(h); - } - } - haplotypes.removeAll(haplotypesToRemove); - - if( OUTPUT_FULL_HAPLOTYPE_SEQUENCE ) { - final List>>> returnVCs = new ArrayList>>>(); - // set up the default 1-to-1 haplotype mapping object - final HashMap> haplotypeMapping = new HashMap>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.put(call.getAllele(h.getBases()), list); - } - returnVCs.add( new Pair>>(call,haplotypeMapping) ); - return returnVCs; - } - - final ArrayList>>> returnCalls = new ArrayList>>>(); - - // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file - final TreeSet startPosKeySet = new TreeSet(); - int count = 0; - if( DEBUG ) { System.out.println("=== Best Haplotypes ==="); } - for( final Haplotype h : haplotypes ) { - if( DEBUG ) { - System.out.println( h.toString() ); - System.out.println( "> Cigar = " + h.getCigar() ); - } - // Walk along the alignment and turn any difference from the reference into an event - h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - startPosKeySet.addAll(h.getEventMap().keySet()); - } - - // Create the VC merge priority list - final ArrayList priorityList = new ArrayList(); - for( int iii = 0; iii < haplotypes.size(); iii++ ) { - priorityList.add("HC" + iii); - } - - // Walk along each position in the key set and create each event to be outputted - for( final int loc : startPosKeySet ) { - if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { - final ArrayList eventsAtThisLoc = new ArrayList(); - for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); - final VariantContext vc = eventMap.get(loc); - if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { - eventsAtThisLoc.add(vc); - } - } - - // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); - - // Merge the event to find a common reference representation - final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); - - final HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper - } - - if( DEBUG ) { - System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); - //System.out.println("Event/haplotype allele mapping = " + alleleMapper); - } - - // Grab the genotype likelihoods from the appropriate places in the haplotype likelihood matrix -- calculation performed independently per sample - final GenotypesContext myGenotypes = GenotypesContext.create(haplotypes.get(0).getSampleKeySet().size()); - for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples - final int myNumHaplotypes = alleleMapper.size(); - final double[] genotypeLikelihoods = new double[myNumHaplotypes * (myNumHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); - int glIndex = 0; - for( int iii = 0; iii < myNumHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - genotypeLikelihoods[glIndex++] = haplotypeLikelihoodMatrix[iii][jjj]; // for example: AA,AB,BB,AC,BC,CC - } - } - - // using the allele mapping object translate the haplotype allele into the event allele - final Genotype g = new GenotypeBuilder(sample) - .alleles(findEventAllelesInSample(mergedVC.getAlleles(), call.getAlleles(), call.getGenotype(sample).getAlleles(), alleleMapper, haplotypes)) - .phased(loc != startPosKeySet.first()) - .PL(genotypeLikelihoods).make(); - myGenotypes.add(g); - } - returnCalls.add( new Pair>>( - new VariantContextBuilder(mergedVC).log10PError(call.getLog10PError()).genotypes(myGenotypes).make(), alleleHashMap) ); - } - } - return returnCalls; - } - // BUGBUG: Create a class to hold this complicated return type @Requires({"refLoc.containsP(activeRegionWindow)", "haplotypes.size() > 0"}) - public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, - final ArrayList haplotypes, - final byte[] ref, - final GenomeLoc refLoc, - final GenomeLoc activeRegionWindow, - final GenomeLocParser genomeLocParser, - final ArrayList activeAllelesToGenotype ) { + public List>>> assignGenotypeLikelihoodsAndCallIndependentEvents( final UnifiedGenotyperEngine UG_engine, + final List haplotypes, + final byte[] ref, + final GenomeLoc refLoc, + final GenomeLoc activeRegionWindow, + final GenomeLocParser genomeLocParser, + final List activeAllelesToGenotype ) { - final ArrayList>>> returnCalls = new ArrayList>>>(); + final ArrayList>>> returnCalls = new ArrayList>>>(); + final boolean in_GGA_mode = !activeAllelesToGenotype.isEmpty(); // Using the cigar from each called haplotype figure out what events need to be written out in a VCF file final TreeSet startPosKeySet = new TreeSet(); @@ -207,7 +69,7 @@ public class GenotypingEngine { for( final Haplotype h : haplotypes ) { // Walk along the alignment and turn any difference from the reference into an event h.setEventMap( generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), ref, h.getBases(), refLoc, "HC" + count++ ) ); - if( activeAllelesToGenotype.isEmpty() ) { startPosKeySet.addAll(h.getEventMap().keySet()); } + if( !in_GGA_mode ) { startPosKeySet.addAll(h.getEventMap().keySet()); } if( DEBUG ) { System.out.println( h.toString() ); System.out.println( "> Cigar = " + h.getCigar() ); @@ -217,10 +79,10 @@ public class GenotypingEngine { } cleanUpSymbolicUnassembledEvents( haplotypes ); - if( activeAllelesToGenotype.isEmpty() && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure + if( !in_GGA_mode && haplotypes.get(0).getSampleKeySet().size() >= 10 ) { // if not in GGA mode and have at least 10 samples try to create MNP and complex events by looking at LD structure mergeConsecutiveEventsBasedOnLD( haplotypes, startPosKeySet, ref, refLoc ); } - if( !activeAllelesToGenotype.isEmpty() ) { // we are in GGA mode! + if( in_GGA_mode ) { for( final VariantContext compVC : activeAllelesToGenotype ) { startPosKeySet.add( compVC.getStart() ); } @@ -232,7 +94,7 @@ public class GenotypingEngine { final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view - if( activeAllelesToGenotype.isEmpty() ) { + if( !in_GGA_mode ) { for( final Haplotype h : haplotypes ) { final HashMap eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); @@ -261,7 +123,14 @@ public class GenotypingEngine { if( eventsAtThisLoc.isEmpty() ) { continue; } // Create the allele mapping object which maps the original haplotype alleles to the alleles present in just this event - final ArrayList> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + Map> alleleMapper = createAlleleMapper( loc, eventsAtThisLoc, haplotypes ); + + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + final ArrayList alleleOrdering = new ArrayList(alleleMapper.size()); + alleleOrdering.add(refAllele); + for( final VariantContext vc : eventsAtThisLoc ) { + alleleOrdering.add(vc.getAlternateAllele(0)); + } // Sanity check the priority list for( final VariantContext vc : eventsAtThisLoc ) { @@ -283,11 +152,15 @@ public class GenotypingEngine { final VariantContext mergedVC = VariantContextUtils.simpleMerge(genomeLocParser, eventsAtThisLoc, priorityList, VariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, VariantContextUtils.GenotypeMergeType.PRIORITIZE, false, false, null, false, false); if( mergedVC == null ) { continue; } - HashMap> alleleHashMap = new HashMap>(); - int aCount = 0; - for( final Allele a : mergedVC.getAlleles() ) { - alleleHashMap.put(a, alleleMapper.get(aCount++)); // BUGBUG: needs to be cleaned up and merged with alleleMapper + // let's update the Allele keys in the mapper because they can change after merging when there are complex events + Map> updatedAlleleMapper = new HashMap>(alleleMapper.size()); + for ( int i = 0; i < mergedVC.getNAlleles(); i++ ) { + final Allele oldAllele = alleleOrdering.get(i); + final Allele newAllele = mergedVC.getAlleles().get(i); + updatedAlleleMapper.put(newAllele, alleleMapper.get(oldAllele)); + alleleOrdering.set(i, newAllele); } + alleleMapper = updatedAlleleMapper; if( DEBUG ) { System.out.println("Genotyping event at " + loc + " with alleles = " + mergedVC.getAlleles()); @@ -299,7 +172,7 @@ public class GenotypingEngine { for( final String sample : haplotypes.get(0).getSampleKeySet() ) { // BUGBUG: assume all haplotypes saw the same samples final int numHaplotypes = alleleMapper.size(); final double[] genotypeLikelihoods = new double[numHaplotypes * (numHaplotypes+1) / 2]; - final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper); + final double[][] haplotypeLikelihoodMatrix = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sample, alleleMapper, alleleOrdering); int glIndex = 0; for( int iii = 0; iii < numHaplotypes; iii++ ) { for( int jjj = 0; jjj <= iii; jjj++ ) { @@ -313,23 +186,23 @@ public class GenotypingEngine { if( call.getAlleles().size() != mergedVC.getAlleles().size() ) { // some alleles were removed so reverseTrimming might be necessary! final VariantContext vcCallTrim = VariantContextUtils.reverseTrimAlleles(call); // also, need to update the allele -> haplotype mapping - final HashMap> alleleHashMapTrim = new HashMap>(); + final HashMap> alleleHashMapTrim = new HashMap>(); for( int iii = 0; iii < vcCallTrim.getAlleles().size(); iii++ ) { // BUGBUG: this is assuming that the original and trimmed alleles maintain the same ordering in the VC - alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleHashMap.get(call.getAlleles().get(iii))); + alleleHashMapTrim.put(vcCallTrim.getAlleles().get(iii), alleleMapper.get(call.getAlleles().get(iii))); } call = vcCallTrim; - alleleHashMap = alleleHashMapTrim; + alleleMapper = alleleHashMapTrim; } - returnCalls.add( new Pair>>(call, alleleHashMap) ); + returnCalls.add( new Pair>>(call, alleleMapper) ); } } } return returnCalls; } - protected static void cleanUpSymbolicUnassembledEvents( final ArrayList haplotypes ) { + protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { final ArrayList haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { @@ -348,7 +221,7 @@ public class GenotypingEngine { haplotypes.removeAll(haplotypesToRemove); } - protected void mergeConsecutiveEventsBasedOnLD( final ArrayList haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { + protected void mergeConsecutiveEventsBasedOnLD( final List haplotypes, final TreeSet startPosKeySet, final byte[] ref, final GenomeLoc refLoc ) { final int MAX_SIZE_TO_COMBINE = 15; final double MERGE_EVENTS_R2_THRESHOLD = 0.95; if( startPosKeySet.size() <= 1 ) { return; } @@ -395,7 +268,9 @@ public class GenotypingEngine { final ArrayList haplotypeList = new ArrayList(); haplotypeList.add(h); for( final String sample : haplotypes.get(0).getSampleKeySet() ) { - final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( haplotypeList, sample )[0][0]; + final HashSet sampleSet = new HashSet(1); + sampleSet.add(sample); + final double haplotypeLikelihood = LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods( sampleSet, haplotypeList )[0][0]; if( thisHapVC == null ) { if( nextHapVC == null ) { x11 = MathUtils.approximateLog10SumLog10(x11, haplotypeLikelihood); } else { x12 = MathUtils.approximateLog10SumLog10(x12, haplotypeLikelihood); } @@ -489,37 +364,87 @@ public class GenotypingEngine { @Requires({"haplotypes.size() >= eventsAtThisLoc.size() + 1"}) @Ensures({"result.size() == eventsAtThisLoc.size() + 1"}) - protected static ArrayList> createAlleleMapper( final int loc, final ArrayList eventsAtThisLoc, final ArrayList haplotypes ) { - final ArrayList> alleleMapper = new ArrayList>(); - final ArrayList refList = new ArrayList(); + protected static Map> createAlleleMapper( final int loc, final List eventsAtThisLoc, final List haplotypes ) { + + final Map> alleleMapper = new HashMap>(eventsAtThisLoc.size()+1); + final Allele refAllele = eventsAtThisLoc.get(0).getReference(); + alleleMapper.put(refAllele, new ArrayList()); + for( final VariantContext vc : eventsAtThisLoc ) + alleleMapper.put(vc.getAlternateAllele(0), new ArrayList()); + + final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { if( h.getEventMap().get(loc) == null ) { // no event at this location so this is a reference-supporting haplotype - refList.add(h); + alleleMapper.get(refAllele).add(h); + } else if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() && alleleMapper.containsKey(h.getArtificialAllele()) ) { + alleleMapper.get(h.getArtificialAllele()).add(h); } else { - boolean foundInEventList = false; + boolean haplotypeIsDetermined = false; for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { if( h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - foundInEventList = true; + alleleMapper.get(vcAtThisLoc.getAlternateAllele(0)).add(h); + haplotypeIsDetermined = true; + break; } } - if( !foundInEventList ) { // event at this location isn't one of the genotype-able options (during GGA) so this is a reference-supporting haplotype - refList.add(h); - } + + if( !haplotypeIsDetermined ) + undeterminedHaplotypes.add(h); } } - alleleMapper.add(refList); - for( final VariantContext vcAtThisLoc : eventsAtThisLoc ) { - final ArrayList list = new ArrayList(); - for( final Haplotype h : haplotypes ) { - if( h.getEventMap().get(loc) != null && h.getEventMap().get(loc).hasSameAllelesAs(vcAtThisLoc) ) { - list.add(h); + + for( final Haplotype h : undeterminedHaplotypes ) { + Allele matchingAllele = null; + for( final Map.Entry> alleleToTest : alleleMapper.entrySet() ) { + // don't test against the reference allele + if( alleleToTest.getKey().equals(refAllele) ) + continue; + + final Haplotype artificialHaplotype = alleleToTest.getValue().get(0); + if( isSubSetOf(artificialHaplotype.getEventMap(), h.getEventMap(), true) ) { + matchingAllele = alleleToTest.getKey(); + break; } } - alleleMapper.add(list); + + if( matchingAllele == null ) + matchingAllele = refAllele; + alleleMapper.get(matchingAllele).add(h); } + return alleleMapper; } + protected static boolean isSubSetOf(final Map subset, final Map superset, final boolean resolveSupersetToSubset) { + + for ( final Map.Entry fromSubset : subset.entrySet() ) { + final VariantContext fromSuperset = superset.get(fromSubset.getKey()); + if ( fromSuperset == null ) + return false; + + List supersetAlleles = fromSuperset.getAlternateAlleles(); + if ( resolveSupersetToSubset ) + supersetAlleles = resolveAlternateAlleles(fromSubset.getValue().getReference(), fromSuperset.getReference(), supersetAlleles); + + if ( !supersetAlleles.contains(fromSubset.getValue().getAlternateAllele(0)) ) + return false; + } + + return true; + } + + private static List resolveAlternateAlleles(final Allele targetReference, final Allele actualReference, final List currentAlleles) { + if ( targetReference.length() <= actualReference.length() ) + return currentAlleles; + + final List newAlleles = new ArrayList(currentAlleles.size()); + final byte[] extraBases = Arrays.copyOfRange(targetReference.getBases(), actualReference.length(), targetReference.length()); + for ( final Allele a : currentAlleles ) { + newAlleles.add(Allele.extend(a, extraBases)); + } + return newAlleles; + } + @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a185ba6af..d194e2620 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -45,6 +45,7 @@ import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedArgumentCollection import org.broadinstitute.sting.gatk.walkers.genotyper.UnifiedGenotyperEngine; import org.broadinstitute.sting.gatk.walkers.genotyper.VariantCallContext; import org.broadinstitute.sting.utils.*; +import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.codecs.vcf.*; @@ -131,14 +132,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minPruning", shortName="minPruning", doc = "The minimum allowed pruning factor in assembly graph. Paths with <= X supporting kmers are pruned from the graph", required = false) protected int MIN_PRUNE_FACTOR = 1; - @Advanced - @Argument(fullName="genotypeFullActiveRegion", shortName="genotypeFullActiveRegion", doc = "If specified, alternate alleles are considered to be the full active region for the purposes of genotyping", required = false) - protected boolean GENOTYPE_FULL_ACTIVE_REGION = false; - - @Advanced - @Argument(fullName="fullHaplotype", shortName="fullHaplotype", doc = "If specified, output the full haplotype sequence instead of converting to individual variants w.r.t. the reference", required = false) - protected boolean OUTPUT_FULL_HAPLOTYPE_SEQUENCE = false; - @Advanced @Argument(fullName="gcpHMM", shortName="gcpHMM", doc="Flat gap continuation penalty for use in the Pair HMM", required = false) protected int gcpHMM = 10; @@ -248,10 +241,11 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // create a UAC but with the exactCallsLog = null, so we only output the log for the HC caller itself, if requested UnifiedArgumentCollection simpleUAC = new UnifiedArgumentCollection(UAC); - simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; // low values used for isActive determination only, default/user-specified values used for actual calling - simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); - simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.max( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); + simpleUAC.OutputMode = UnifiedGenotyperEngine.OUTPUT_MODE.EMIT_VARIANTS_ONLY; + simpleUAC.GenotypingMode = GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.DISCOVERY; + simpleUAC.STANDARD_CONFIDENCE_FOR_CALLING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_CALLING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.STANDARD_CONFIDENCE_FOR_EMITTING = Math.min( 4.0, UAC.STANDARD_CONFIDENCE_FOR_EMITTING ); // low values used for isActive determination only, default/user-specified values used for actual calling + simpleUAC.CONTAMINATION_FRACTION = 0.0; simpleUAC.exactCallsLog = null; UG_engine_simple_genotyper = new UnifiedGenotyperEngine(getToolkit(), simpleUAC, logger, null, null, samples, VariantContextUtils.DEFAULT_PLOIDY); @@ -273,15 +267,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem VCFConstants.GENOTYPE_QUALITY_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_PL_KEY); - // header lines for the experimental HaplotypeCaller-specific annotations - headerInfo.add(new VCFInfoHeaderLine("NVH", 1, VCFHeaderLineType.Integer, "Number of variants found on the haplotype that contained this variant")); - headerInfo.add(new VCFInfoHeaderLine("NumHapEval", 1, VCFHeaderLineType.Integer, "Number of haplotypes that were chosen for evaluation in this active region")); - headerInfo.add(new VCFInfoHeaderLine("NumHapAssembly", 1, VCFHeaderLineType.Integer, "Number of haplotypes created during the assembly of this active region")); - headerInfo.add(new VCFInfoHeaderLine("ActiveRegionSize", 1, VCFHeaderLineType.Integer, "Number of base pairs that comprise this active region")); - headerInfo.add(new VCFInfoHeaderLine("EVENTLENGTH", 1, VCFHeaderLineType.Integer, "Max length of all the alternate alleles")); - headerInfo.add(new VCFInfoHeaderLine("TYPE", 1, VCFHeaderLineType.String, "Type of event: SNP or INDEL")); - headerInfo.add(new VCFInfoHeaderLine("extType", 1, VCFHeaderLineType.String, "Extended type of event: SNP, MNP, INDEL, or COMPLEX")); - headerInfo.add(new VCFInfoHeaderLine("QDE", 1, VCFHeaderLineType.Float, "QD value divided by the number of variants found on the haplotype that contained this variant")); // FILTER fields are added unconditionally as it's not always 100% certain the circumstances // where the filters are used. For example, in emitting all sites the lowQual field is used @@ -298,7 +283,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem assemblyEngine = new SimpleDeBruijnAssembler( DEBUG, graphWriter ); likelihoodCalculationEngine = new LikelihoodCalculationEngine( (byte)gcpHMM, DEBUG, pairHMM ); - genotypingEngine = new GenotypingEngine( DEBUG, OUTPUT_FULL_HAPLOTYPE_SEQUENCE ); + genotypingEngine = new GenotypingEngine( DEBUG ); } //--------------------------------------------------------------------------------------------------------------- @@ -311,9 +296,15 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Override public boolean includeReadsWithDeletionAtLoci() { return true; } - // enable non primary reads in the active region + // enable non primary and extended reads in the active region @Override - public boolean wantsNonPrimaryReads() { return true; } + public EnumSet desiredReadStates() { + return EnumSet.of( + ActiveRegionReadState.PRIMARY, + ActiveRegionReadState.NONPRIMARY, + ActiveRegionReadState.EXTENDED + ); + } @Override @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) @@ -421,52 +412,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes ) : haplotypes ); - for( final Pair>> callResult : - ( GENOTYPE_FULL_ACTIVE_REGION && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES - ? genotypingEngine.assignGenotypeLikelihoodsAndCallHaplotypeEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getExtendedLoc(), getToolkit().getGenomeLocParser() ) - : genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) ) { + for( final Pair>> callResult : + genotypingEngine.assignGenotypeLikelihoodsAndCallIndependentEvents( UG_engine, bestHaplotypes, fullReferenceWithPadding, getPaddedLoc(activeRegion), activeRegion.getLocation(), getToolkit().getGenomeLocParser(), activeAllelesToGenotype ) ) { if( DEBUG ) { System.out.println(callResult.getFirst().toStringWithoutGenotypes()); } final Map stratifiedReadMap = LikelihoodCalculationEngine.partitionReadsBasedOnLikelihoods( getToolkit().getGenomeLocParser(), perSampleReadList, perSampleFilteredReadList, callResult, UG_engine.getUAC().CONTAMINATION_FRACTION, UG_engine.getUAC().contaminationLog ); final VariantContext annotatedCall = annotationEngine.annotateContext(stratifiedReadMap, callResult.getFirst()); final Map myAttributes = new LinkedHashMap(annotatedCall.getAttributes()); - - if( !GENOTYPE_FULL_ACTIVE_REGION ) { - // add some custom annotations to the calls - - // Calculate the number of variants on the haplotype - int maxNumVar = 0; - for( final Allele allele : callResult.getFirst().getAlleles() ) { - if( !allele.isReference() ) { - for( final Haplotype haplotype : callResult.getSecond().get(allele) ) { - final int numVar = haplotype.getEventMap().size(); - if( numVar > maxNumVar ) { maxNumVar = numVar; } - } - } - } - // Calculate the event length - int maxLength = 0; - for ( final Allele a : annotatedCall.getAlternateAlleles() ) { - final int length = a.length() - annotatedCall.getReference().length(); - if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; } - } - - myAttributes.put("NVH", maxNumVar); - myAttributes.put("NumHapEval", bestHaplotypes.size()); - myAttributes.put("NumHapAssembly", haplotypes.size()); - myAttributes.put("ActiveRegionSize", activeRegion.getLocation().size()); - myAttributes.put("EVENTLENGTH", maxLength); - myAttributes.put("TYPE", (annotatedCall.isSNP() || annotatedCall.isMNP() ? "SNP" : "INDEL") ); - myAttributes.put("extType", annotatedCall.getType().toString() ); - - //if( likelihoodCalculationEngine.haplotypeScore != null ) { - // myAttributes.put("HaplotypeScore", String.format("%.4f", likelihoodCalculationEngine.haplotypeScore)); - //} - if( annotatedCall.hasAttribute("QD") ) { - myAttributes.put("QDE", String.format("%.2f", Double.parseDouble((String)annotatedCall.getAttribute("QD")) / ((double)maxNumVar)) ); - } - } - vcfWriter.add( new VariantContextBuilder(annotatedCall).attributes(myAttributes).make() ); } @@ -522,6 +474,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); // protect against INTERVALS with abnormally high coverage + // BUGBUG: remove when positinal downsampler is hooked up to ART/HC if( clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { activeRegion.add(clippedRead); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index a0924623b..29622ca17 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -148,45 +148,31 @@ public class LikelihoodCalculationEngine { return Math.min(b1.length, b2.length); } - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.length == result[0].length", "result.length == haplotypes.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final ArrayList haplotypes, final String sample ) { - // set up the default 1-to-1 haplotype mapping object, BUGBUG: target for future optimization? - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - return computeDiploidHaplotypeLikelihoods( sample, haplotypeMapping ); - } - // This function takes just a single sample and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final String sample, final Map> haplotypeMapping, final List alleleOrdering ) { final TreeSet sampleSet = new TreeSet(); sampleSet.add(sample); - return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping); + return computeDiploidHaplotypeLikelihoods(sampleSet, haplotypeMapping, alleleOrdering); } // This function takes a set of samples to pool over and a haplotypeMapping @Requires({"haplotypeMapping.size() > 0"}) @Ensures({"result.length == result[0].length", "result.length == haplotypeMapping.size()"}) - public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final ArrayList> haplotypeMapping ) { + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final Map> haplotypeMapping, final List alleleOrdering ) { - final int numHaplotypes = haplotypeMapping.size(); + final int numHaplotypes = alleleOrdering.size(); final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; for( int iii = 0; iii < numHaplotypes; iii++ ) { Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); } // compute the diploid haplotype likelihoods - // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - for( final Haplotype iii_mapped : haplotypeMapping.get(iii) ) { - for( final Haplotype jjj_mapped : haplotypeMapping.get(jjj) ) { + for( int jjj = 0; jjj <= iii; jjj++ ) { + for( final Haplotype iii_mapped : haplotypeMapping.get(alleleOrdering.get(iii)) ) { + for( final Haplotype jjj_mapped : haplotypeMapping.get(alleleOrdering.get(jjj)) ) { double haplotypeLikelihood = 0.0; for( final String sample : samples ) { final double[] readLikelihoods_iii = iii_mapped.getReadLikelihoods(sample); @@ -200,12 +186,48 @@ public class LikelihoodCalculationEngine { } haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); } - } + } } } // normalize the diploid likelihoods matrix - return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); + } + + // This function takes a set of samples to pool over and a haplotypeMapping + @Requires({"haplotypeList.size() > 0"}) + @Ensures({"result.length == result[0].length", "result.length == haplotypeList.size()"}) + public static double[][] computeDiploidHaplotypeLikelihoods( final Set samples, final List haplotypeList ) { + + final int numHaplotypes = haplotypeList.size(); + final double[][] haplotypeLikelihoodMatrix = new double[numHaplotypes][numHaplotypes]; + for( int iii = 0; iii < numHaplotypes; iii++ ) { + Arrays.fill(haplotypeLikelihoodMatrix[iii], Double.NEGATIVE_INFINITY); + } + + // compute the diploid haplotype likelihoods + // todo - needs to be generalized to arbitrary ploidy, cleaned and merged with PairHMMIndelErrorModel code + for( int iii = 0; iii < numHaplotypes; iii++ ) { + final Haplotype iii_haplotype = haplotypeList.get(iii); + for( int jjj = 0; jjj <= iii; jjj++ ) { + final Haplotype jjj_haplotype = haplotypeList.get(jjj); + double haplotypeLikelihood = 0.0; + for( final String sample : samples ) { + final double[] readLikelihoods_iii = iii_haplotype.getReadLikelihoods(sample); + final int[] readCounts_iii = iii_haplotype.getReadCounts(sample); + final double[] readLikelihoods_jjj = jjj_haplotype.getReadLikelihoods(sample); + for( int kkk = 0; kkk < readLikelihoods_iii.length; kkk++ ) { + // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+10^x2)-log10(2) + // First term is approximated by Jacobian log with table lookup. + haplotypeLikelihood += readCounts_iii[kkk] * ( MathUtils.approximateLog10SumLog10(readLikelihoods_iii[kkk], readLikelihoods_jjj[kkk]) + LOG_ONE_HALF ); + } + } + haplotypeLikelihoodMatrix[iii][jjj] = Math.max(haplotypeLikelihoodMatrix[iii][jjj], haplotypeLikelihood); + } + } + + // normalize the diploid likelihoods matrix + return normalizeDiploidLikelihoodMatrixFromLog10( haplotypeLikelihoodMatrix ); } @Requires({"likelihoodMatrix.length == likelihoodMatrix[0].length"}) @@ -296,14 +318,7 @@ public class LikelihoodCalculationEngine { final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples final ArrayList bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype - // set up the default 1-to-1 haplotype mapping object - final ArrayList> haplotypeMapping = new ArrayList>(); - for( final Haplotype h : haplotypes ) { - final ArrayList list = new ArrayList(); - list.add(h); - haplotypeMapping.add(list); - } - final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypeMapping ); // all samples pooled together + final double[][] haplotypeLikelihoodMatrix = computeDiploidHaplotypeLikelihoods( sampleKeySet, haplotypes ); // all samples pooled together int hap1 = 0; int hap2 = 0; @@ -347,7 +362,7 @@ public class LikelihoodCalculationEngine { public static Map partitionReadsBasedOnLikelihoods( final GenomeLocParser parser, final HashMap> perSampleReadList, final HashMap> perSampleFilteredReadList, - final Pair>> call, + final Pair>> call, final double downsamplingFraction, final PrintStream downsamplingLog ) { final Map returnMap = new HashMap(); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index fd46e4e69..4f072d720 100755 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -278,9 +278,10 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef(); final int activeRegionStop = refHaplotype.getAlignmentStartHapwrtRef() + refHaplotype.getCigar().getReferenceLength(); - for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype + // for GGA mode, add the desired allele into the haplotype + for( final VariantContext compVC : activeAllelesToGenotype ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()); + final Haplotype insertedRefHaplotype = refHaplotype.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()); if( !addHaplotype( insertedRefHaplotype, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { return returnHaplotypes; //throw new ReviewedStingException("Unable to add reference+allele haplotype during GGA-enabled assembly: " + insertedRefHaplotype); @@ -290,15 +291,24 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { for( final DefaultDirectedGraph graph : graphs ) { for ( final KBestPaths.Path path : KBestPaths.getKBestPaths(graph, NUM_BEST_PATHS_PER_KMER_GRAPH) ) { + final Haplotype h = new Haplotype( path.getBases( graph ), path.getScore() ); if( addHaplotype( h, fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ) ) { - if( !activeAllelesToGenotype.isEmpty() ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present + + // for GGA mode, add the desired allele into the haplotype if it isn't already present + if( !activeAllelesToGenotype.isEmpty() ) { final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); - if( vcOnHaplotype == null || !vcOnHaplotype.hasSameAllelesAs(compVC) ) { + + // This if statement used to additionally have: + // "|| !vcOnHaplotype.hasSameAllelesAs(compVC)" + // but that can lead to problems downstream when e.g. you are injecting a 1bp deletion onto + // a haplotype that already contains a 1bp insertion (so practically it is reference but + // falls into the bin for the 1bp deletion because we keep track of the artificial alleles). + if( vcOnHaplotype == null ) { for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); + addHaplotype( h.insertAllele(compVC.getReference(), compAltAllele, activeRegionStart + compVC.getStart() - activeRegionWindow.getStart(), compVC.getStart()), fullReferenceWithPadding, returnHaplotypes, activeRegionStart, activeRegionStop ); } } } @@ -369,6 +379,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { h.setAlignmentStartHapwrtRef( swConsensus2.getAlignmentStart2wrt1() ); h.setCigar( AlignmentUtils.leftAlignIndel(swConsensus2.getCigar(), ref, h.getBases(), swConsensus2.getAlignmentStart2wrt1(), 0) ); + if ( haplotype.isArtificialHaplotype() ) + h.setArtificialAllele(haplotype.getArtificialAllele(), haplotype.getArtificialAllelePosition()); h.leftBreakPoint = leftBreakPoint; h.rightBreakPoint = rightBreakPoint; if( swConsensus2.getCigar().toString().contains("S") || swConsensus2.getCigar().getReferenceLength() != activeRegionStop - activeRegionStart ) { // protect against SW failures diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index f6ec47760..de328c825 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -51,16 +51,16 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "387b41dc2221a1a4a782958944662b25")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "b5e26902e76abbd59f94f65c70d18165")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "a8a9c3f83269911cb61c5fe8fb98dc4a")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "f43a0473101c63ae93444c300d843e81")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "9e05e63339d4716584bfc717cab6bd0f")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "1cf9b9c9c64617dc0f3d2f203f918dbe")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "aa1949a77bc3066fee551a217c970c0d")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "4fd3c9ad97e6ac58cba644a76564c9f7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "2620f734cce20f70ce13afd880e46e5c")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "5eb3b94e767da19a4c037ee132e4b19a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "ab261d291b107a3da7897759c0e4fa89")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "292303f649fbb19dc05d4a0197a49eeb")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "8ced9d1094493f17fb1876b818a64541")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "abb838131e403d39820dbd66932d1ed0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "f70d8b5358bc2f76696f14b7a807ede0")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "4c0f63e06830681560a1e9f9aad9fe98")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "be2812cd3dae3c326cf35ae3f1c8ad9e")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "8f62aa0e75770204c98d8299793cc53c")}, {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "03c29a0c1d21f72b12daf51cec111599")}, {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "7080b2cad02ec6e67ebc766b2dccebf8")}, {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "30e76055c16843b6e33e5b9bd8ced57c")}, diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index d3e77e002..9d12b0ded 100755 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("704888987baacff8c7b273b8ab9938d0")); + Arrays.asList("d20c7a143b899f0239bf64b652ad3edb")); executeTest("test Multiple SNP alleles", spec); } @@ -197,7 +197,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "81fff490c0f59890f1e75dc290833434"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4"); } private void testOutputParameters(final String args, final String md5) { @@ -345,7 +345,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("a4761d7f25e7a62f34494801c98a0da7")); + Arrays.asList("69df7a00f800204564ca3726e1871132")); List result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java index cbe2eb268..96e055e92 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResultUnitTest.java @@ -2,16 +2,14 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.List; +import java.util.*; public class AFCalcResultUnitTest extends BaseTest { private static class MyTest { @@ -79,4 +77,54 @@ public class AFCalcResultUnitTest extends BaseTest { final double[] actualPosteriors = new double[]{result.getLog10PosteriorOfAFEq0(), result.getLog10PosteriorOfAFGT0()}; Assert.assertEquals(MathUtils.sumLog10(actualPosteriors), 1.0, 1e-3, "Posteriors don't sum to 1 with 1e-3 precision"); } + + @DataProvider(name = "TestIsPolymorphic") + public Object[][] makeTestIsPolymorphic() { + List tests = new ArrayList(); + + final List pValues = new LinkedList(); + for ( final double p : Arrays.asList(0.01, 0.1, 0.9, 0.99, 0.999) ) + for ( final double espilon : Arrays.asList(-1e-5, 0.0, 1e-5) ) + pValues.add(p + espilon); + + for ( final double pNonRef : pValues ) { + for ( final double pThreshold : pValues ) { + final boolean shouldBePoly = pNonRef >= pThreshold; + if ( pNonRef != pThreshold) + // let's not deal with numerical instability + tests.add(new Object[]{ pNonRef, pThreshold, shouldBePoly }); + } + } + + return tests.toArray(new Object[][]{}); + } + + private AFCalcResult makePolymorphicTestData(final double pNonRef) { + return new AFCalcResult( + new int[]{0}, + 1, + alleles, + MathUtils.normalizeFromLog10(new double[]{1 - pNonRef, pNonRef}, true, false), + log10Even, + Collections.singletonMap(C, Math.log10(pNonRef))); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphic(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final boolean actualIsPoly = result.isPolymorphic(C, Math.log10(pThreshold)); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } + + @Test(enabled = true, dataProvider = "TestIsPolymorphic") + private void testIsPolymorphicQual(final double pNonRef, final double pThreshold, final boolean shouldBePoly) { + final AFCalcResult result = makePolymorphicTestData(pNonRef); + final double qual = QualityUtils.phredScaleCorrectRate(pThreshold); + final boolean actualIsPoly = result.isPolymorphicPhredScaledQual(C, qual); + Assert.assertEquals(actualIsPoly, shouldBePoly, + "isPolymorphic with pNonRef " + pNonRef + " and threshold " + pThreshold + " returned " + + actualIsPoly + " but the expected result is " + shouldBePoly); + } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index 6828dbcb5..f8ba1f4cc 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,17 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "56aa4b84606b6b0b7dc78a383974d1b3"); + HCTest(CEUTRIO_BAM, "", "2b39732ff8e0de5bc2ae949aaf7a6f21"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "baabae06c85d416920be434939124d7f"); + HCTest(NA12878_BAM, "", "8b217638ff585effb9cc70e9a9aa544f"); } + // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { - HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", "39da622b309597d7a0b082c8aa1748c9"); + HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", + "541aa8291f03ba33bd1ad3d731fd5657"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -42,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "966d338f423c86a390d685aa6336ec69"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "fd7170cbde7df04d4fbe1da7903c31c6"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -53,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "7fbc6b9e27e374f2ffe4be952d88c7c6"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "99456fc7207c1fe9f367a0d0afae87cd"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -64,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "2581e760279291a3901a506d060bfac8"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "6c1631785b3f832aecab1a99f0454762"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("788176e1717bd28fc7cbc8e3efbb6100")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ec437d2d9f3ae07d155983be0155c8ed")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("96ab8253d242b851ccfc218759f79784")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("237601bbc39694c7413a332cbb656c8e")); executeTest("HCTestStructuralIndels: ", spec); } @@ -91,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("425f1a0fb00d7145edf1c55e54346fae")); + Arrays.asList("40bf739fb2b1743642498efe79ea6342")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java index e82946690..19ced9f42 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngineUnitTest.java @@ -10,7 +10,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.MathUtils; import org.testng.Assert; -import org.testng.annotations.BeforeClass; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; @@ -102,7 +101,9 @@ public class LikelihoodCalculationEngineUnitTest extends BaseTest { haplotypes.add(haplotype); } } - return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(haplotypes, "myTestSample"); + final HashSet sampleSet = new HashSet(1); + sampleSet.add("myTestSample"); + return LikelihoodCalculationEngine.computeDiploidHaplotypeLikelihoods(sampleSet, haplotypes); } } diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index d77ae67cf..fb15a3722 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -33,6 +33,7 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.HelpFormatter; +import org.broadinstitute.sting.utils.help.HelpUtils; import java.io.IOException; import java.util.*; @@ -288,7 +289,7 @@ public abstract class CommandLineProgram { */ private static void printDocumentationReference() { errorPrintf("Visit our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions http://www.broadinstitute.org/gatk%n"); + errorPrintf("commonly asked questions " + HelpUtils.BASE_GATK_URL + "%n"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index 0daad2c2b..d1711ba4c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -39,6 +39,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.ApplicationDetails; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -118,17 +119,24 @@ public class CommandLineGATK extends CommandLineExecutable { public static final String DISK_QUOTA_EXCEEDED_ERROR = "Disk quota exceeded"; private static void checkForMaskedUserErrors(final Throwable t) { + // masked out of memory error + if ( t instanceof OutOfMemoryError ) + exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked user error + if ( t instanceof UserException || t instanceof TribbleException ) + exitSystemWithUserError(new UserException(t.getMessage())); + + // no message means no masked error final String message = t.getMessage(); if ( message == null ) return; - // we know what to do about the common "Too many open files" error + // too many open files error if ( message.contains("Too many open files") ) exitSystemWithUserError(new UserException.TooManyOpenFiles()); // malformed BAM looks like a SAM file - if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || - message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) + if ( message.contains(PICARD_TEXT_SAM_FILE_ERROR_1) || message.contains(PICARD_TEXT_SAM_FILE_ERROR_2) ) exitSystemWithSamError(t); // can't close tribble index when writing @@ -138,12 +146,10 @@ public class CommandLineGATK extends CommandLineExecutable { // disk is full if ( message.contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || message.contains(DISK_QUOTA_EXCEEDED_ERROR) ) exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - if ( t.getCause() != null && (t.getCause().getMessage().contains(NO_SPACE_LEFT_ON_DEVICE_ERROR) || t.getCause().getMessage().contains(DISK_QUOTA_EXCEEDED_ERROR)) ) - exitSystemWithUserError(new UserException.NoSpaceOnDevice()); - // masked out of memory error - if ( t.getCause() != null && t.getCause() instanceof OutOfMemoryError ) - exitSystemWithUserError(new UserException.NotEnoughMemory()); + // masked error wrapped in another one + if ( t.getCause() != null ) + checkForMaskedUserErrors(t.getCause()); } /** @@ -155,7 +161,7 @@ public class CommandLineGATK extends CommandLineExecutable { List header = new ArrayList(); header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to http://www.broadinstitute.org/gatk"); + header.add("For support and documentation go to " + HelpUtils.BASE_GATK_URL); return header; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java index bb788c89f..88de3ac9b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/SAMDataSource.java @@ -30,12 +30,10 @@ import net.sf.samtools.*; import net.sf.samtools.util.CloseableIterator; import net.sf.samtools.util.RuntimeIOException; import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.downsampling.*; -import org.broadinstitute.sting.gatk.downsampling.DownsampleType; -import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.ReadMetrics; import org.broadinstitute.sting.gatk.ReadProperties; import org.broadinstitute.sting.gatk.arguments.ValidationExclusion; +import org.broadinstitute.sting.gatk.downsampling.*; import org.broadinstitute.sting.gatk.filters.CountingFilteringIterator; import org.broadinstitute.sting.gatk.filters.ReadFilter; import org.broadinstitute.sting.gatk.iterators.*; @@ -567,7 +565,7 @@ public class SAMDataSource { * * @return the start positions of the first chunk of reads for all BAM files */ - public Map getInitialReaderPositions() { + protected Map getInitialReaderPositions() { Map initialPositions = new HashMap(); SAMReaders readers = resourcePool.getAvailableReaders(); @@ -585,7 +583,7 @@ public class SAMDataSource { * @param shard The shard specifying the data limits. * @return An iterator over the selected data. */ - public StingSAMIterator getIterator( Shard shard ) { + protected StingSAMIterator getIterator( Shard shard ) { return getIterator(resourcePool.getAvailableReaders(), shard, shard instanceof ReadShard); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index 5ca8a1779..89099c587 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -25,11 +25,9 @@ package org.broadinstitute.sting.gatk.filters; -import com.google.common.base.Function; -import com.google.common.collect.Collections2; -import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.help.GATKDocUtils; +import org.broadinstitute.sting.utils.help.HelpUtils; import java.util.Collection; import java.util.List; @@ -73,7 +71,7 @@ public class FilterManager extends PluginManager { return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, userFriendlyListofReadFilters(availableFilters), - "Please consult the GATK Documentation (http://www.broadinstitute.org/gatk/gatkdocs/) for more information."); + "Please consult the GATK Documentation (" + HelpUtils.GATK_DOCS_URL + ") for more information."); } private String userFriendlyListofReadFilters(List> filters) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index 47bc48f81..6685ee12a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -271,7 +271,18 @@ public class GATKReport { * @return a simplified GATK report */ public static GATKReport newSimpleReport(final String tableName, final String... columns) { - GATKReportTable table = new GATKReportTable(tableName, "A simplified GATK table report", columns.length); + return newSimpleReportWithDescription(tableName, "A simplified GATK table report", columns); + } + + /** + * @see #newSimpleReport(String, String...) but with a customized description + * @param tableName + * @param desc + * @param columns + * @return + */ + public static GATKReport newSimpleReportWithDescription(final String tableName, final String desc, final String... columns) { + GATKReportTable table = new GATKReportTable(tableName, desc, columns.length); for (String column : columns) { table.addColumn(column, ""); diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java index b51fb17f0..1079d9b91 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportVersion.java @@ -80,6 +80,9 @@ public enum GATKReportVersion { * @return The version as an enum. */ public static GATKReportVersion fromHeader(String header) { + if ( header == null ) + throw new UserException.BadInput("The GATK report has no version specified in the header"); + if (header.startsWith("##:GATKReport.v0.1 ")) return GATKReportVersion.V0_1; diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 4fe83f331..06fc01232 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -34,9 +34,6 @@ public class TraverseActiveRegions extends TraversalEngine workQueue = new LinkedList(); private final LinkedHashSet myReads = new LinkedHashSet(); - // package access for unit testing - ActivityProfile profile; - @Override public String getTraversalUnits() { return "active regions"; @@ -56,7 +53,7 @@ public class TraverseActiveRegions extends TraversalEngine activeRegions = new LinkedList(); - profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); @@ -83,7 +80,6 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine extends Walker desiredReadStates() { + return EnumSet.of(ActiveRegionReadState.PRIMARY); + } + + public final boolean wantsNonPrimaryReads() { + return desiredReadStates().contains(ActiveRegionReadState.NONPRIMARY); + } + + public boolean wantsExtendedReads() { + return desiredReadStates().contains(ActiveRegionReadState.EXTENDED); + } + + public boolean wantsUnmappedReads() { + return desiredReadStates().contains(ActiveRegionReadState.UNMAPPED); } // Determine probability of active status over the AlignmentContext + @Ensures({"result.isActiveProb >= 0.0", "result.isActiveProb <= 1.0"}) public abstract ActivityProfileResult isActive(final RefMetaDataTracker tracker, final ReferenceContext ref, final AlignmentContext context); // Map over the ActiveRegion diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java index 82596a501..2679a169b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MappingQualityRankSumTest.java @@ -29,7 +29,7 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn final List refQuals, final List altQuals) { if (pileup != null && likelihoodMap == null) { - // no per-read likelihoods available: + // old UG snp-only path through the annotations for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { if ( allAlleles.get(0).equals(Allele.create(p.getBase(), true)) ) { @@ -43,14 +43,13 @@ public class MappingQualityRankSumTest extends RankSumTest implements StandardAn } for (Map.Entry> el : likelihoodMap.getLikelihoodReadMap().entrySet()) { final Allele a = PerReadAlleleLikelihoodMap.getMostLikelyAllele(el.getValue()); + // BUGBUG: There needs to be a comparable isUsableBase check here if (a.isNoCall()) continue; // read is non-informative if (a.isReference()) refQuals.add((double)el.getKey().getMappingQuality()); else if (allAlleles.contains(a)) altQuals.add((double)el.getKey().getMappingQuality()); - - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 0df7aff71..e7c0e6b14 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -49,7 +49,7 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements ActiveR ReadBackedPileup pileup = null; - if (stratifiedContexts != null) { + if (stratifiedContexts != null) { // the old UG SNP-only path through the annotations final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context != null ) pileup = context.getBasePileup(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java index d01233bb2..334b89f01 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/ReadPosRankSumTest.java @@ -39,7 +39,7 @@ public class ReadPosRankSumTest extends RankSumTest implements StandardAnnotatio final List refQuals, final List altQuals) { if (alleleLikelihoodMap == null) { - // use fast SNP-based version if we don't have per-read allele likelihoods + // use old UG SNP-based version if we don't have per-read allele likelihoods for ( final PileupElement p : pileup ) { if ( isUsableBase(p) ) { int readPos = AlignmentUtils.calcAlignmentByteArrayOffset(p.getRead().getCigar(), p, 0, 0); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java index c4de9ed45..92060b4a3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotator.java @@ -82,7 +82,7 @@ import java.util.*; @Allows(value={DataSource.READS, DataSource.REFERENCE}) @Reference(window=@Window(start=-50,stop=50)) @By(DataSource.REFERENCE) -public class VariantAnnotator extends RodWalker implements AnnotatorCompatible { +public class VariantAnnotator extends RodWalker implements AnnotatorCompatible, TreeReducible { @ArgumentCollection protected StandardVariantContextInputArgumentCollection variantCollection = new StandardVariantContextInputArgumentCollection(); @@ -275,14 +275,6 @@ public class VariantAnnotator extends RodWalker implements Ann return true; } - /** - * Initialize the number of loci processed to zero. - * - * @return 0 - */ - public Integer reduceInit() { return 0; } - - /** * We want reads that span deletions * @@ -323,15 +315,15 @@ public class VariantAnnotator extends RodWalker implements Ann return 1; } - /** - * Increment the number of loci processed. - * - * @param value result of the map. - * @param sum accumulator for the reduce. - * @return the new number of loci processed. - */ - public Integer reduce(Integer value, Integer sum) { - return sum + value; + @Override + public Integer reduceInit() { return 0; } + + @Override + public Integer reduce(Integer value, Integer sum) { return value + sum; } + + @Override + public Integer treeReduce(Integer lhs, Integer rhs) { + return lhs + rhs; } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 9506510a9..7ce98cf1d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -227,7 +227,7 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche */ public Long map( final ReferenceContext ref, final GATKSAMRecord originalRead, final RefMetaDataTracker metaDataTracker ) { - final GATKSAMRecord read = ReadClipper.hardClipAdaptorSequence(originalRead); + final GATKSAMRecord read = ReadClipper.hardClipSoftClippedBases( ReadClipper.hardClipAdaptorSequence(originalRead) ); if( read.isEmpty() ) { return 0L; } // the whole read was inside the adaptor so skip it RecalUtils.parsePlatformForRead(read, RAC); @@ -268,16 +268,25 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche } protected boolean[] calculateKnownSites( final GATKSAMRecord read, final List features ) { - final int BUFFER_SIZE = 0; final int readLength = read.getReadBases().length; final boolean[] knownSites = new boolean[readLength]; Arrays.fill(knownSites, false); for( final Feature f : features ) { int featureStartOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getStart(), ReadUtils.ClippingTail.LEFT_TAIL, true); // BUGBUG: should I use LEFT_TAIL here? - if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureStartOnRead = 0; } + if( featureStartOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureStartOnRead = 0; + } + int featureEndOnRead = ReadUtils.getReadCoordinateForReferenceCoordinate(read.getSoftStart(), read.getCigar(), f.getEnd(), ReadUtils.ClippingTail.LEFT_TAIL, true); - if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { featureEndOnRead = readLength; } - Arrays.fill(knownSites, Math.max(0, featureStartOnRead - BUFFER_SIZE), Math.min(readLength, featureEndOnRead + 1 + BUFFER_SIZE), true); + if( featureEndOnRead == ReadUtils.CLIPPING_GOAL_NOT_REACHED ) { + featureEndOnRead = readLength; + } + + if( featureStartOnRead > readLength ) { + featureStartOnRead = featureEndOnRead = readLength; + } + + Arrays.fill(knownSites, Math.max(0, featureStartOnRead), Math.min(readLength, featureEndOnRead + 1), true); } return knownSites; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index e5704a1e2..c64482151 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -102,13 +102,10 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "no_standard_covs", shortName = "noStandard", doc = "Do not use the standard set of covariates, but rather just the ones listed using the -cov argument", required = false) public boolean DO_NOT_USE_STANDARD_COVARIATES = false; - ///////////////////////////// - // Debugging-only Arguments - ///////////////////////////// /** * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. */ - @Hidden + @Advanced @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") public boolean RUN_WITHOUT_DBSNP = false; @@ -139,6 +136,13 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "indels_context_size", shortName = "ics", doc = "size of the k-mer context to be used for base insertions and deletions", required = false) public int INDELS_CONTEXT_SIZE = 3; + /** + * The cycle covariate will generate an error if it encounters a cycle greater than this value. + * This argument is ignored if the Cycle covariate is not used. + */ + @Argument(fullName = "maximum_cycle_value", shortName = "maxCycle", doc = "the maximum cycle value permitted for the Cycle covariate", required = false) + public int MAXIMUM_CYCLE_VALUE = 500; + /** * A default base qualities to use as a prior (reported quality) in the mismatch covariate model. This value will replace all base qualities in the read for this default value. Negative value turns it off (default is off) */ @@ -176,9 +180,15 @@ public class RecalibrationArgumentCollection { @Argument(fullName = "binary_tag_name", shortName = "bintag", required = false, doc = "the binary tag covariate name if using it") public String BINARY_TAG_NAME = null; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + @Hidden @Argument(fullName = "default_platform", shortName = "dP", required = false, doc = "If a read has no platform then default to the provided String. Valid options are illumina, 454, and solid.") public String DEFAULT_PLATFORM = null; + @Hidden @Argument(fullName = "force_platform", shortName = "fP", required = false, doc = "If provided, the platform of EVERY read will be forced to be the provided String. Valid options are illumina, 454, and solid.") public String FORCE_PLATFORM = null; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java index a41e55166..21532823b 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CoverageUtils.java @@ -6,11 +6,10 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.pileup.PileupElement; -import java.util.Collection; -import java.util.HashMap; -import java.util.Map; +import java.util.*; /** * IF THERE IS NO JAVADOC RIGHT HERE, YELL AT chartl @@ -20,6 +19,21 @@ import java.util.Map; */ public class CoverageUtils { + public enum CountPileupType { + /** + * Count all reads independently (even if from the same fragment). + */ + COUNT_READS, + /** + * Count all fragments (even if the reads that compose the fragment are not consistent at that base). + */ + COUNT_FRAGMENTS, + /** + * Count all fragments (but only if the reads that compose the fragment are consistent at that base). + */ + COUNT_FRAGMENTS_REQUIRE_SAME_BASE + } + /** * Returns the counts of bases from reads with MAPQ > minMapQ and base quality > minBaseQ in the context * as an array of ints, indexed by the index fields of BaseUtils @@ -64,10 +78,10 @@ public class CoverageUtils { } public static Map> - getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, Collection types) { + getBaseCountsByPartition(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType, Collection types) { Map> countsByIDByType = new HashMap>(); - Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ); + Map countsByRG = getBaseCountsByReadGroup(context,minMapQ,maxMapQ,minBaseQ,maxBaseQ,countType); for (DoCOutputType.Partition t : types ) { // iterate through the read group counts and build the type associations for ( Map.Entry readGroupCountEntry : countsByRG.entrySet() ) { @@ -95,31 +109,95 @@ public class CoverageUtils { } } - public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + public static Map getBaseCountsByReadGroup(AlignmentContext context, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ, CountPileupType countType) { Map countsByRG = new HashMap(); - for ( PileupElement e : context.getBasePileup() ) { - if ( e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() ) ) { - SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); - if ( ! countsByRG.keySet().contains(readGroup) ) { - countsByRG.put(readGroup,new int[6]); - updateCounts(countsByRG.get(readGroup),e); - } else { - updateCounts(countsByRG.get(readGroup),e); + + List countPileup = new LinkedList(); + FragmentCollection fpile; + + switch (countType) { + + case COUNT_READS: + for (PileupElement e : context.getBasePileup()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + break; + + case COUNT_FRAGMENTS: // ignore base identities and put in FIRST base that passes filters: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) { + countPileup.add(e); // add the first passing element per fragment + break; + } + } } - } + break; + + case COUNT_FRAGMENTS_REQUIRE_SAME_BASE: + fpile = context.getBasePileup().getStartSortedPileup().toFragments(); + + for (PileupElement e : fpile.getSingletonReads()) + if (countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + countPileup.add(e); + + for (List overlappingPair : fpile.getOverlappingPairs()) { + PileupElement firstElem = null; + PileupElement addElem = null; + + // iterate over all elements in fragment: + for (PileupElement e : overlappingPair) { + if (firstElem == null) + firstElem = e; + else if (e.getBase() != firstElem.getBase()) { + addElem = null; + break; + } + + // will add the first passing element per base-consistent fragment: + if (addElem == null && countElement(e, minMapQ, maxMapQ, minBaseQ, maxBaseQ)) + addElem = e; + } + + if (addElem != null) + countPileup.add(addElem); + } + break; + + default: + throw new UserException("Must use valid CountPileupType"); + } + + for (PileupElement e : countPileup) { + SAMReadGroupRecord readGroup = getReadGroup(e.getRead()); + if (!countsByRG.keySet().contains(readGroup)) + countsByRG.put(readGroup, new int[6]); + + updateCounts(countsByRG.get(readGroup), e); } return countsByRG; } + private static boolean countElement(PileupElement e, int minMapQ, int maxMapQ, byte minBaseQ, byte maxBaseQ) { + return (e.getMappingQual() >= minMapQ && e.getMappingQual() <= maxMapQ && ( e.getQual() >= minBaseQ && e.getQual() <= maxBaseQ || e.isDeletion() )); + } + private static void updateCounts(int[] counts, PileupElement e) { if ( e.isDeletion() ) { - counts[BaseUtils.DELETION_INDEX]++; + counts[BaseUtils.DELETION_INDEX] += e.getRepresentativeCount(); } else if ( BaseUtils.basesAreEqual((byte) 'N', e.getBase()) ) { - counts[BaseUtils.NO_CALL_INDEX]++; + counts[BaseUtils.NO_CALL_INDEX] += e.getRepresentativeCount(); } else { try { - counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())]++; + counts[BaseUtils.simpleBaseToBaseIndex(e.getBase())] += e.getRepresentativeCount(); } catch (ArrayIndexOutOfBoundsException exc) { throw new ReviewedStingException("Expected a simple base, but actually received"+(char)e.getBase()); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java index 44b0d74ca..fe9942662 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/DepthOfCoverage.java @@ -129,11 +129,15 @@ public class DepthOfCoverage extends LocusWalker { /** * Writes out a file of the DiffEngine format: * - * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine + * TODO for Geraldine -- link needs to be updated below (and also in SelectVariants and RefSeqCodec GATK docs) + * http://www.broadinstitute.org/gsa/wiki/index.php/DiffEngine */ @Output(doc="File to which results should be written",required=true) protected PrintStream out; diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java index 2b9744b89..22c6097cf 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/fasta/FastaAlternateReferenceMaker.java @@ -47,6 +47,12 @@ import java.util.List; * * Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 80bc04845..cc086b148 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -382,11 +382,9 @@ public class UnifiedGenotyperEngine { if ( alternateAllele.isReference() ) continue; - // we are non-ref if the probability of being non-ref > the emit confidence. - // the emit confidence is phred-scaled, say 30 => 10^-3. - // the posterior AF > 0 is log10: -5 => 10^-5 - // we are non-ref if 10^-5 < 10^-3 => -5 < -3 - final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); // if the most likely AC is not 0, then this is a good alternate allele to use if ( isNonRef ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index a65772444..dbb0e8cdd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -234,10 +235,20 @@ public class AFCalcResult { * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 */ + @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Same as #isPolymorphic but takes a phred-scaled quality score as input + */ + public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { + if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); + final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + return isPolymorphic(allele, log10Threshold); + } + /** * Are any of the alleles polymorphic w.r.t. #isPolymorphic? * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index d8ae6b28b..eda43e6a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -95,7 +95,8 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) public class ReadBackedPhasing extends RodWalker { - private static final boolean DEBUG = false; + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) + protected boolean DEBUG = false; /** * The VCF file we are phasing variants from. * @@ -949,7 +950,7 @@ public class ReadBackedPhasing extends RodWalkerInput * diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java index 480113e1e..830c6590d 100644 --- a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java @@ -210,13 +210,23 @@ public class JnaSession implements Session { } public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException { - checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + if (getAttrNames().contains(name)) { + checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static String getAttribute(Pointer jt, String name) throws DrmaaException { - Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); - checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); - return attrBuffer.getString(0); + if (getAttrNames().contains(name)) { + Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return attrBuffer.getString(0); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static void setVectorAttribute(Pointer jt, String name, Collection values) throws DrmaaException { diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index a3ffe708c..bf60b4a80 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -374,7 +374,7 @@ public final class GenomeLocParser { int start = 1; int stop = -1; - final int colonIndex = str.indexOf(":"); + final int colonIndex = str.lastIndexOf(":"); if(colonIndex == -1) { contig = str.substring(0, str.length()); // chr1 stop = Integer.MAX_VALUE; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index d11adf9e3..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex >= 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,7 +49,9 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; - + private Allele artificialAllele = null; + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -71,6 +73,12 @@ public class Haplotype { this(bases, 0); } + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,8 +179,25 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -182,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 1242e5b00..848beccb8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_RECALIBRATED_Q_SCORE = 93; + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..e4d6f6233 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); @@ -701,11 +705,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java new file mode 100644 index 000000000..00e491eb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/26/12 + * Time: 2:35 PM + * + * Describes how a read relates to an assigned ActiveRegion + */ +public enum ActiveRegionReadState { + PRIMARY, // This is the read's primary region + NONPRIMARY, // This region overlaps the read, but it is not primary + EXTENDED, // This region would overlap the read if it were extended + UNMAPPED // This read is not mapped +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 38cfbb38d..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,11 +103,6 @@ public class ActivityProfile { isActiveList.add(result); } - // for unit testing - public List getActiveList() { - return isActiveList; - } - public int size() { return isActiveList.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index cf4d699ee..9ad1bf773 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -406,10 +406,15 @@ public class BAQ { // so BQi = Qi - BAQi + 64 byte[] bqTag = new byte[baq.length]; for ( int i = 0; i < bqTag.length; i++) { - int bq = (int)read.getBaseQualities()[i] + 64; - int baq_i = (int)baq[i]; - int tag = bq - baq_i; - if ( tag < 0 ) throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + final int bq = (int)read.getBaseQualities()[i] + 64; + final int baq_i = (int)baq[i]; + final int tag = bq - baq_i; + // problem with the calculation of the correction factor; this is our problem + if ( tag < 0 ) + throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + // the original quality is too high, almost certainly due to using the wrong encoding in the BAM file + if ( tag > Byte.MAX_VALUE ) + throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores"); bqTag[i] = (byte)tag; } return new String(bqTag); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a49a12292..a2ec35ae2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -267,7 +268,7 @@ public class UserException extends ReviewedStingException { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); + super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.GATK_FORUM_URL + "discussion/59/companion-utilities-replacereadgroups to fix this problem", read.getReadName())); } } @@ -343,7 +344,7 @@ public class UserException extends ReviewedStingException { super(String.format("Lexicographically sorted human genome sequence detected in %s." + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam" + + "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.GATK_FORUM_URL + "discussion/58/companion-utilities-reordersam" + "\n %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index fe5f48a48..64238dc73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,14 +44,13 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { Gson gson = new Gson(); List output = new ArrayList(); - String text = httpGet(API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); + String text = httpGet(HelpUtils.GATK_FORUM_API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); APIQuery details = gson.fromJson(text, APIQuery.class); ForumDiscussion[] discussions = details.Discussions; @@ -159,7 +158,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); String data = gson.toJson(post.getPostData()); - httpPost(data, API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); + httpPost(data, HelpUtils.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); } @@ -167,8 +166,7 @@ public class ForumAPIUtils { class APIQuery { ForumDiscussion[] Discussions; - public APIQuery() { - } + public APIQuery() {} } } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 4ec2ac6d7..21054a794 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -28,7 +28,7 @@ public class GATKDocUtils { /** * The URL root for RELEASED GATKDOC units */ - public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gatk/gatkdocs/"; + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpUtils.GATK_DOCS_URL; /** * The URL root for STABLE GATKDOC units */ diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 645ab34c1..1bc20d5a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -32,6 +32,15 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import java.lang.reflect.Field; public class HelpUtils { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ed6fc46bb..25f0bfa6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker
* Given variant tracks, it replaces the reference bases at variation sites with the bases supplied by the ROD(s). * Additionally, allows for one or more "snpmask" VCFs to set overlapping bases to 'N'. + * + * The output format can be partially controlled using the provided command-line arguments. + * Specify intervals with the usual -L argument to output only the reference bases within your intervals. + * Overlapping intervals are automatically merged; reference bases for each disjoint interval will be output as a + * separate fasta sequence (named numerically in order). + * * Several important notes: * 1) if there are multiple variants that start at a site, it chooses one of them randomly. * 2) when there are overlapping indels (but with different start positions) only the first will be chosen. diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 80bc04845..cc086b148 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -382,11 +382,9 @@ public class UnifiedGenotyperEngine { if ( alternateAllele.isReference() ) continue; - // we are non-ref if the probability of being non-ref > the emit confidence. - // the emit confidence is phred-scaled, say 30 => 10^-3. - // the posterior AF > 0 is log10: -5 => 10^-5 - // we are non-ref if 10^-5 < 10^-3 => -5 < -3 - final boolean isNonRef = AFresult.isPolymorphic(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING / -10.0); + // Compute if the site is considered polymorphic with sufficient confidence relative to our + // phred-scaled emission QUAL + final boolean isNonRef = AFresult.isPolymorphicPhredScaledQual(alternateAllele, UAC.STANDARD_CONFIDENCE_FOR_EMITTING); // if the most likely AC is not 0, then this is a good alternate allele to use if ( isNonRef ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java index a65772444..dbb0e8cdd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/afcalc/AFCalcResult.java @@ -28,6 +28,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper.afcalc; import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.variantcontext.Allele; @@ -234,10 +235,20 @@ public class AFCalcResult { * * @return true if there's enough confidence (relative to log10minPNonRef) to reject AF == 0 */ + @Requires("MathUtils.goodLog10Probability(log10minPNonRef)") public boolean isPolymorphic(final Allele allele, final double log10minPNonRef) { return getLog10PosteriorOfAFGt0ForAllele(allele) >= log10minPNonRef; } + /** + * Same as #isPolymorphic but takes a phred-scaled quality score as input + */ + public boolean isPolymorphicPhredScaledQual(final Allele allele, final double minPNonRefPhredScaledQual) { + if ( minPNonRefPhredScaledQual < 0 ) throw new IllegalArgumentException("phredScaledQual " + minPNonRefPhredScaledQual + " < 0 "); + final double log10Threshold = Math.log10(QualityUtils.qualToProb(minPNonRefPhredScaledQual)); + return isPolymorphic(allele, log10Threshold); + } + /** * Are any of the alleles polymorphic w.r.t. #isPolymorphic? * diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index d8ae6b28b..eda43e6a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -95,7 +95,8 @@ import static org.broadinstitute.sting.utils.codecs.vcf.VCFUtils.getVCFHeadersFr @DocumentedGATKFeature( groupName = "Variant Discovery Tools", extraDocs = {CommandLineGATK.class} ) public class ReadBackedPhasing extends RodWalker { - private static final boolean DEBUG = false; + @Argument(fullName="debug", shortName="debug", doc="If specified, print out very verbose debug information (if -l DEBUG is also specified)", required = false) + protected boolean DEBUG = false; /** * The VCF file we are phasing variants from. * @@ -949,7 +950,7 @@ public class ReadBackedPhasing extends RodWalkerInput * diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java index 480113e1e..830c6590d 100644 --- a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java @@ -210,13 +210,23 @@ public class JnaSession implements Session { } public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException { - checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + if (getAttrNames().contains(name)) { + checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static String getAttribute(Pointer jt, String name) throws DrmaaException { - Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); - checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); - return attrBuffer.getString(0); + if (getAttrNames().contains(name)) { + Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return attrBuffer.getString(0); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static void setVectorAttribute(Pointer jt, String name, Collection values) throws DrmaaException { diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index a3ffe708c..bf60b4a80 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -374,7 +374,7 @@ public final class GenomeLocParser { int start = 1; int stop = -1; - final int colonIndex = str.indexOf(":"); + final int colonIndex = str.lastIndexOf(":"); if(colonIndex == -1) { contig = str.substring(0, str.length()); // chr1 stop = Integer.MAX_VALUE; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index d11adf9e3..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex >= 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,7 +49,9 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; - + private Allele artificialAllele = null; + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -71,6 +73,12 @@ public class Haplotype { this(bases, 0); } + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,8 +179,25 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -182,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 1242e5b00..848beccb8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_RECALIBRATED_Q_SCORE = 93; + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..e4d6f6233 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); @@ -701,11 +705,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java new file mode 100644 index 000000000..00e491eb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/26/12 + * Time: 2:35 PM + * + * Describes how a read relates to an assigned ActiveRegion + */ +public enum ActiveRegionReadState { + PRIMARY, // This is the read's primary region + NONPRIMARY, // This region overlaps the read, but it is not primary + EXTENDED, // This region would overlap the read if it were extended + UNMAPPED // This read is not mapped +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 38cfbb38d..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,11 +103,6 @@ public class ActivityProfile { isActiveList.add(result); } - // for unit testing - public List getActiveList() { - return isActiveList; - } - public int size() { return isActiveList.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index cf4d699ee..9ad1bf773 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -406,10 +406,15 @@ public class BAQ { // so BQi = Qi - BAQi + 64 byte[] bqTag = new byte[baq.length]; for ( int i = 0; i < bqTag.length; i++) { - int bq = (int)read.getBaseQualities()[i] + 64; - int baq_i = (int)baq[i]; - int tag = bq - baq_i; - if ( tag < 0 ) throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + final int bq = (int)read.getBaseQualities()[i] + 64; + final int baq_i = (int)baq[i]; + final int tag = bq - baq_i; + // problem with the calculation of the correction factor; this is our problem + if ( tag < 0 ) + throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + // the original quality is too high, almost certainly due to using the wrong encoding in the BAM file + if ( tag > Byte.MAX_VALUE ) + throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores"); bqTag[i] = (byte)tag; } return new String(bqTag); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a49a12292..a2ec35ae2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -267,7 +268,7 @@ public class UserException extends ReviewedStingException { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); + super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.GATK_FORUM_URL + "discussion/59/companion-utilities-replacereadgroups to fix this problem", read.getReadName())); } } @@ -343,7 +344,7 @@ public class UserException extends ReviewedStingException { super(String.format("Lexicographically sorted human genome sequence detected in %s." + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam" + + "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.GATK_FORUM_URL + "discussion/58/companion-utilities-reordersam" + "\n %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index fe5f48a48..64238dc73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,14 +44,13 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { Gson gson = new Gson(); List output = new ArrayList(); - String text = httpGet(API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); + String text = httpGet(HelpUtils.GATK_FORUM_API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); APIQuery details = gson.fromJson(text, APIQuery.class); ForumDiscussion[] discussions = details.Discussions; @@ -159,7 +158,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); String data = gson.toJson(post.getPostData()); - httpPost(data, API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); + httpPost(data, HelpUtils.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); } @@ -167,8 +166,7 @@ public class ForumAPIUtils { class APIQuery { ForumDiscussion[] Discussions; - public APIQuery() { - } + public APIQuery() {} } } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 4ec2ac6d7..21054a794 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -28,7 +28,7 @@ public class GATKDocUtils { /** * The URL root for RELEASED GATKDOC units */ - public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gatk/gatkdocs/"; + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpUtils.GATK_DOCS_URL; /** * The URL root for STABLE GATKDOC units */ diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 645ab34c1..1bc20d5a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -32,6 +32,15 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import java.lang.reflect.Field; public class HelpUtils { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ed6fc46bb..25f0bfa6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker
diff --git a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java index 480113e1e..830c6590d 100644 --- a/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java +++ b/public/java/src/org/broadinstitute/sting/jna/drmaa/v1_0/JnaSession.java @@ -210,13 +210,23 @@ public class JnaSession implements Session { } public static void setAttribute(Pointer jt, String name, String value) throws DrmaaException { - checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + if (getAttrNames().contains(name)) { + checkError(LibDrmaa.drmaa_set_attribute(jt, name, value, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static String getAttribute(Pointer jt, String name) throws DrmaaException { - Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); - checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); - return attrBuffer.getString(0); + if (getAttrNames().contains(name)) { + Memory attrBuffer = new Memory(LibDrmaa.DRMAA_ATTR_BUFFER); + checkError(LibDrmaa.drmaa_get_attribute(jt, name, attrBuffer, LibDrmaa.DRMAA_ATTR_BUFFER_LEN, getError(), LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN)); + return attrBuffer.getString(0); + } + else { + throw new InvalidAttributeValueException("Attribute " + name + " is not supported by this implementation of DRMAA"); + } } public static void setVectorAttribute(Pointer jt, String name, Collection values) throws DrmaaException { diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java index 6df9c9f1d..4d2c26a79 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLoc.java @@ -315,6 +315,20 @@ public class GenomeLoc implements Comparable, Serializable, HasGenome return ( comparison == -1 || ( comparison == 0 && this.getStop() < that.getStart() )); } + /** + * Tests whether this genome loc starts at the same position as that. + * + * i.e., do this and that have the same contig and the same start position + * + * @param that genome loc to compare to + * @return true if this and that have the same contig and the same start position + */ + @Requires("that != null") + public final boolean startsAt( GenomeLoc that ) { + int comparison = this.compareContigs(that); + return comparison == 0 && this.getStart() == that.getStart(); + } + /** * Tests whether any portion of this contig is before that contig. * @param that Other contig to test. diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java index a3ffe708c..bf60b4a80 100644 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocParser.java @@ -374,7 +374,7 @@ public final class GenomeLocParser { int start = 1; int stop = -1; - final int colonIndex = str.indexOf(":"); + final int colonIndex = str.lastIndexOf(":"); if(colonIndex == -1) { contig = str.substring(0, str.length()); // chr1 stop = Integer.MAX_VALUE; diff --git a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java index d11adf9e3..394220106 100755 --- a/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java +++ b/public/java/src/org/broadinstitute/sting/utils/GenomeLocSortedSet.java @@ -43,6 +43,9 @@ public class GenomeLocSortedSet extends AbstractSet { // our private storage for the GenomeLoc's private List mArray = new ArrayList(); + // cache this to make overlap checking much more efficient + private int previousOverlapSearchIndex = -1; + /** default constructor */ public GenomeLocSortedSet(GenomeLocParser parser) { this.genomeLocParser = parser; @@ -101,7 +104,7 @@ public class GenomeLocSortedSet extends AbstractSet { * Return the number of bps before loc in the sorted set * * @param loc the location before which we are counting bases - * @return + * @return the number of base pairs over all previous intervals */ public long sizeBeforeLoc(GenomeLoc loc) { long s = 0; @@ -110,7 +113,7 @@ public class GenomeLocSortedSet extends AbstractSet { if ( e.isBefore(loc) ) s += e.size(); else if ( e.isPast(loc) ) - ; // don't do anything + break; // we are done else // loc is inside of s s += loc.getStart() - e.getStart(); } @@ -131,15 +134,43 @@ public class GenomeLocSortedSet extends AbstractSet { * Determine if the given loc overlaps any loc in the sorted set * * @param loc the location to test - * @return + * @return trip if the location overlaps any loc */ public boolean overlaps(final GenomeLoc loc) { - for(final GenomeLoc e : mArray) { - if(e.overlapsP(loc)) { - return true; - } + // edge condition + if ( mArray.isEmpty() ) + return false; + + // use the cached version first + if ( previousOverlapSearchIndex != -1 && overlapsAtOrImmediatelyAfterCachedIndex(loc, true) ) + return true; + + // update the cached index + previousOverlapSearchIndex = Collections.binarySearch(mArray, loc); + + // if it matches an interval exactly, we are done + if ( previousOverlapSearchIndex >= 0 ) + return true; + + // check whether it overlaps the interval before or after the insertion point + previousOverlapSearchIndex = Math.max(0, -1 * previousOverlapSearchIndex - 2); + return overlapsAtOrImmediatelyAfterCachedIndex(loc, false); + } + + private boolean overlapsAtOrImmediatelyAfterCachedIndex(final GenomeLoc loc, final boolean updateCachedIndex) { + // check the cached entry + if ( mArray.get(previousOverlapSearchIndex).overlapsP(loc) ) + return true; + + // check the entry after the cached entry since we may have moved to it + boolean returnValue = false; + if ( previousOverlapSearchIndex < mArray.size() - 1 ) { + returnValue = mArray.get(previousOverlapSearchIndex + 1).overlapsP(loc); + if ( updateCachedIndex ) + previousOverlapSearchIndex++; } - return false; + + return returnValue; } /** @@ -155,7 +186,7 @@ public class GenomeLocSortedSet extends AbstractSet { mArray.add(e); return true; } else { - int loc = Collections.binarySearch(mArray,e); + final int loc = Collections.binarySearch(mArray,e); if (loc >= 0) { throw new ReviewedStingException("Genome Loc Sorted Set already contains the GenomicLoc " + e.toString()); } else { diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index b30d47074..30fdce75d 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -49,7 +49,9 @@ public class Haplotype { private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; public int rightBreakPoint = 0; - + private Allele artificialAllele = null; + private int artificialAllelePosition = -1; + /** * Create a simple consensus sequence with provided bases and a uniform quality over all bases of qual * @@ -71,6 +73,12 @@ public class Haplotype { this(bases, 0); } + protected Haplotype( final byte[] bases, final Allele artificialAllele, final int artificialAllelePosition ) { + this(bases, 0); + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + public Haplotype( final byte[] bases, final GenomeLoc loc ) { this(bases); this.genomeLocation = loc; @@ -171,8 +179,25 @@ public class Haplotype { this.cigar = cigar; } + public boolean isArtificialHaplotype() { + return artificialAllele != null; + } + + public Allele getArtificialAllele() { + return artificialAllele; + } + + public int getArtificialAllelePosition() { + return artificialAllelePosition; + } + + public void setArtificialAllele(final Allele artificialAllele, final int artificialAllelePosition) { + this.artificialAllele = artificialAllele; + this.artificialAllelePosition = artificialAllelePosition; + } + @Requires({"refInsertLocation >= 0"}) - public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation ) { + public Haplotype insertAllele( final Allele refAllele, final Allele altAllele, final int refInsertLocation, final int genomicInsertLocation ) { // refInsertLocation is in ref haplotype offset coordinates NOT genomic coordinates final int haplotypeInsertLocation = ReadUtils.getReadCoordinateForReferenceCoordinate(alignmentStartHapwrtRef, cigar, refInsertLocation, ReadUtils.ClippingTail.RIGHT_TAIL, true); if( haplotypeInsertLocation == -1 || haplotypeInsertLocation + refAllele.length() >= bases.length ) { // desired change falls inside deletion so don't bother creating a new haplotype @@ -182,7 +207,7 @@ public class Haplotype { newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, 0, haplotypeInsertLocation)); // bases before the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, altAllele.getBases()); // the alt allele of the variant newHaplotypeBases = ArrayUtils.addAll(newHaplotypeBases, ArrayUtils.subarray(bases, haplotypeInsertLocation + refAllele.length(), bases.length)); // bases after the variant - return new Haplotype(newHaplotypeBases); + return new Haplotype(newHaplotypeBases, altAllele, genomicInsertLocation); } public static class HaplotypeBaseComparator implements Comparator, Serializable { diff --git a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java index 1242e5b00..848beccb8 100755 --- a/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/QualityUtils.java @@ -9,7 +9,7 @@ import net.sf.samtools.SAMUtils; * @author Kiran Garimella */ public class QualityUtils { - public final static byte MAX_RECALIBRATED_Q_SCORE = 93; + public final static byte MAX_RECALIBRATED_Q_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static byte MAX_QUAL_SCORE = SAMUtils.MAX_PHRED_SCORE; public final static double ERROR_RATE_OF_MAX_QUAL_SCORE = qualToErrorProbRaw(MAX_QUAL_SCORE); diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index b780d0966..e4d6f6233 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -293,6 +293,10 @@ public class Utils { } } + public static String join(final String separator, final T ... objects) { + return join(separator, Arrays.asList(objects)); + } + public static String dupString(char c, int nCopies) { char[] chars = new char[nCopies]; Arrays.fill(chars, c); @@ -701,11 +705,13 @@ public class Utils { List oldRecords = header.getProgramRecords(); List newRecords = new ArrayList(oldRecords.size()+1); for ( SAMProgramRecord record : oldRecords ) - if ( !record.getId().startsWith(programRecord.getId()) || KEEP_ALL_PG_RECORDS ) + if ( (programRecord != null && !record.getId().startsWith(programRecord.getId())) || KEEP_ALL_PG_RECORDS ) newRecords.add(record); - newRecords.add(programRecord); - header.setProgramRecords(newRecords); + if (programRecord != null) { + newRecords.add(programRecord); + header.setProgramRecords(newRecords); + } return header; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java new file mode 100644 index 000000000..00e491eb0 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegionReadState.java @@ -0,0 +1,16 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 11/26/12 + * Time: 2:35 PM + * + * Describes how a read relates to an assigned ActiveRegion + */ +public enum ActiveRegionReadState { + PRIMARY, // This is the read's primary region + NONPRIMARY, // This region overlaps the read, but it is not primary + EXTENDED, // This region would overlap the read if it were extended + UNMAPPED // This read is not mapped +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java index 38cfbb38d..e96eb843d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActivityProfile.java @@ -103,11 +103,6 @@ public class ActivityProfile { isActiveList.add(result); } - // for unit testing - public List getActiveList() { - return isActiveList; - } - public int size() { return isActiveList.size(); } diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index cf4d699ee..9ad1bf773 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -406,10 +406,15 @@ public class BAQ { // so BQi = Qi - BAQi + 64 byte[] bqTag = new byte[baq.length]; for ( int i = 0; i < bqTag.length; i++) { - int bq = (int)read.getBaseQualities()[i] + 64; - int baq_i = (int)baq[i]; - int tag = bq - baq_i; - if ( tag < 0 ) throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + final int bq = (int)read.getBaseQualities()[i] + 64; + final int baq_i = (int)baq[i]; + final int tag = bq - baq_i; + // problem with the calculation of the correction factor; this is our problem + if ( tag < 0 ) + throw new ReviewedStingException("BAQ tag calculation error. BAQ value above base quality at " + read); + // the original quality is too high, almost certainly due to using the wrong encoding in the BAM file + if ( tag > Byte.MAX_VALUE ) + throw new UserException.MalformedBAM(read, "we encountered an extremely high quality score (" + (bq - 64) + ") with BAQ correction factor of " + baq_i + "; the BAM file appears to be using the wrong encoding for quality scores"); bqTag[i] = (byte)tag; } return new String(bqTag); diff --git a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java index be87e7306..a8aefb703 100755 --- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/VCFUtils.java @@ -30,12 +30,17 @@ import net.sf.samtools.SAMSequenceRecord; import org.apache.commons.io.FilenameUtils; import org.apache.log4j.Logger; import org.broad.tribble.Feature; +import org.broad.tribble.FeatureCodecHeader; +import org.broad.tribble.readers.PositionalBufferedStream; import org.broadinstitute.sting.commandline.RodBinding; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.variantcontext.VariantContext; import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; import java.util.*; /** @@ -317,4 +322,33 @@ public class VCFUtils { assembly = "hg19"; return assembly; } + + /** + * Read all of the VCF records from source into memory, returning the header and the VariantContexts + * + * @param source the file to read, must be in VCF4 format + * @return + * @throws IOException + */ + public static Pair> readVCF(final File source) throws IOException { + // read in the features + final List vcs = new ArrayList(); + final VCFCodec codec = new VCFCodec(); + PositionalBufferedStream pbs = new PositionalBufferedStream(new FileInputStream(source)); + FeatureCodecHeader header = codec.readHeader(pbs); + pbs.close(); + + pbs = new PositionalBufferedStream(new FileInputStream(source)); + pbs.skip(header.getHeaderEnd()); + + final VCFHeader vcfHeader = (VCFHeader)header.getHeaderValue(); + + while ( ! pbs.isDone() ) { + final VariantContext vc = codec.decode(pbs); + if ( vc != null ) + vcs.add(vc); + } + + return new Pair>(vcfHeader, vcs); + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index a49a12292..a2ec35ae2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,6 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; +import org.broadinstitute.sting.utils.help.HelpUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -267,7 +268,7 @@ public class UserException extends ReviewedStingException { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use http://www.broadinstitute.org/gsa/wiki/index.php/ReplaceReadGroups to fix this problem", read.getReadName())); + super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpUtils.GATK_FORUM_URL + "discussion/59/companion-utilities-replacereadgroups to fix this problem", read.getReadName())); } } @@ -343,7 +344,7 @@ public class UserException extends ReviewedStingException { super(String.format("Lexicographically sorted human genome sequence detected in %s." + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: http://www.broadinstitute.org/gsa/wiki/index.php/ReorderSam" + + "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.GATK_FORUM_URL + "discussion/58/companion-utilities-reordersam" + "\n %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index fe5f48a48..64238dc73 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -44,14 +44,13 @@ public class ForumAPIUtils { /** * How we post to the forum */ - private final static String API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; final private static String ACCESS_TOKEN = "access_token="; public static List getPostedTools(String forumKey) { Gson gson = new Gson(); List output = new ArrayList(); - String text = httpGet(API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); + String text = httpGet(HelpUtils.GATK_FORUM_API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); APIQuery details = gson.fromJson(text, APIQuery.class); ForumDiscussion[] discussions = details.Discussions; @@ -159,7 +158,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); String data = gson.toJson(post.getPostData()); - httpPost(data, API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); + httpPost(data, HelpUtils.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); } @@ -167,8 +166,7 @@ public class ForumAPIUtils { class APIQuery { ForumDiscussion[] Discussions; - public APIQuery() { - } + public APIQuery() {} } } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 4ec2ac6d7..21054a794 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -28,7 +28,7 @@ public class GATKDocUtils { /** * The URL root for RELEASED GATKDOC units */ - public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = "http://www.broadinstitute.org/gatk/gatkdocs/"; + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpUtils.GATK_DOCS_URL; /** * The URL root for STABLE GATKDOC units */ diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 645ab34c1..1bc20d5a0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -32,6 +32,15 @@ import org.broadinstitute.sting.utils.classloader.JVMUtils; import java.lang.reflect.Field; public class HelpUtils { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc); diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index ed6fc46bb..25f0bfa6d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -254,19 +254,32 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker(); for (final String sample : tracker.getSamples()) { PileupElementTracker perSampleElements = tracker.getElements(sample); - AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(); + AbstractReadBackedPileup pileup = createNewPileup(loc, perSampleElements).getOverlappingFragmentFilteredPileup(discardDiscordant, baseQualNotMapQual); filteredTracker.addElements(sample, pileup.pileupElementTracker); } return (RBP) createNewPileup(loc, filteredTracker); @@ -284,11 +297,16 @@ public abstract class AbstractReadBackedPileup tracker = (PerSamplePileupElementTracker) pileupElementTracker; PerSamplePileupElementTracker filteredTracker = new PerSamplePileupElementTracker