diff --git a/public/java/src/org/broadinstitute/sting/utils/Utils.java b/public/java/src/org/broadinstitute/sting/utils/Utils.java index f6edb319f..6ce492c63 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/public/java/src/org/broadinstitute/sting/utils/Utils.java @@ -240,22 +240,34 @@ public class Utils { return ret.toString(); } - //public static String join(String separator, Collection strings) { - // return join( separator, strings.toArray(new String[0]) ); - //} - - public static String join(String separator, Collection objects) { - if(objects.isEmpty()) { + /** + * Returns a string of the form elt1.toString() [sep elt2.toString() ... sep elt.toString()] for a collection of + * elti objects (note there's no actual space between sep and the elti elements). Returns + * "" if collection is empty. If collection contains just elt, then returns elt.toString() + * + * @param separator the string to use to separate objects + * @param objects a collection of objects. the element order is defined by the iterator over objects + * @param the type of the objects + * @return a non-null string + */ + public static String join(final String separator, final Collection objects) { + if (objects.isEmpty()) { // fast path for empty collection return ""; - } - Iterator iter = objects.iterator(); - final StringBuilder ret = new StringBuilder(iter.next().toString()); - while(iter.hasNext()) { - ret.append(separator); - ret.append(iter.next().toString()); - } + } else { + final Iterator iter = objects.iterator(); + final T first = iter.next(); - return ret.toString(); + if ( ! iter.hasNext() ) // fast path for singleton collections + return first.toString(); + else { // full path for 2+ collection that actually need a join + final StringBuilder ret = new StringBuilder(first.toString()); + while(iter.hasNext()) { + ret.append(separator); + ret.append(iter.next().toString()); + } + return ret.toString(); + } + } } public static String dupString(char c, int nCopies) { diff --git a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java index ef7cf751e..4f096f86e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java +++ b/public/java/src/org/broadinstitute/sting/utils/baq/BAQ.java @@ -5,6 +5,7 @@ import net.sf.picard.reference.ReferenceSequence; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; +import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -131,19 +132,18 @@ public class BAQ { private final static double EM = 0.33333333333; private final static double EI = 0.25; - private double[][][] EPSILONS = new double[256][256][64]; + private double[][][] EPSILONS = new double[256][256][SAMUtils.MAX_PHRED_SCORE+1]; private void initializeCachedData() { for ( int i = 0; i < 256; i++ ) for ( int j = 0; j < 256; j++ ) - for ( int q = 0; q < 64; q++ ) { - double qual = qual2prob[q < minBaseQual ? minBaseQual : q]; + for ( int q = 0; q <= SAMUtils.MAX_PHRED_SCORE; q++ ) { EPSILONS[i][j][q] = 1.0; } for ( char b1 : "ACGTacgt".toCharArray() ) { for ( char b2 : "ACGTacgt".toCharArray() ) { - for ( int q = 0; q < 64; q++ ) { + for ( int q = 0; q <= SAMUtils.MAX_PHRED_SCORE; q++ ) { double qual = qual2prob[q < minBaseQual ? minBaseQual : q]; double e = Character.toLowerCase(b1) == Character.toLowerCase(b2) ? 1 - qual : qual * EM; EPSILONS[(byte)b1][(byte)b2][q] = e; @@ -152,7 +152,7 @@ public class BAQ { } } - private double calcEpsilon( byte ref, byte read, byte qualB ) { + protected double calcEpsilon( byte ref, byte read, byte qualB ) { return EPSILONS[ref][read][qualB]; } diff --git a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java index 343146d6d..ca9c71eba 100755 --- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContextUtils.java @@ -316,19 +316,22 @@ public class VariantContextUtils { return pruneVariantContext(vc, null); } - public static VariantContext pruneVariantContext(VariantContext vc, Collection keysToPreserve ) { - MutableVariantContext mvc = new MutableVariantContext(vc); + public static VariantContext pruneVariantContext(final VariantContext vc, final Collection keysToPreserve ) { + final MutableVariantContext mvc = new MutableVariantContext(vc); if ( keysToPreserve == null || keysToPreserve.size() == 0 ) mvc.clearAttributes(); else { - Map d = mvc.getAttributes(); + final Map d = mvc.getAttributes(); mvc.clearAttributes(); for ( String key : keysToPreserve ) if ( d.containsKey(key) ) mvc.putAttribute(key, d.get(key)); } + // this must be done as the ID is stored in the attributes field + if ( vc.hasID() ) mvc.setID(vc.getID()); + Collection gs = mvc.getGenotypes().values(); mvc.clearGenotypes(); for ( Genotype g : gs ) { @@ -443,34 +446,6 @@ public class VariantContextUtils { throw new ReviewedStingException(String.format("Couldn't find master VCF %s at %s", masterName, unsortedVCs.iterator().next())); } - - public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, byte refBase) { - return simpleMerge(genomeLocParser, unsortedVCs, null, FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED, GenotypeMergeType.UNSORTED, false, false, refBase); - } - - - /** - * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. - * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with - * the sample name - * - * @param genomeLocParser loc parser - * @param unsortedVCs collection of unsorted VCs - * @param priorityListOfVCs priority list detailing the order in which we should grab the VCs - * @param filteredRecordMergeType merge type for filtered records - * @param genotypeMergeOptions merge option for genotypes - * @param annotateOrigin should we annotate the set it came from? - * @param printMessages should we print messages? - * @param inputRefBase the ref base - * @return new VariantContext - */ - public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, - FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, - boolean annotateOrigin, boolean printMessages, byte inputRefBase ) { - - return simpleMerge(genomeLocParser, unsortedVCs, priorityListOfVCs, filteredRecordMergeType, genotypeMergeOptions, annotateOrigin, printMessages, "set", false, false); - } - /** * Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided. * If uniqifySamples is true, the priority order is ignored and names are created by concatenating the VC name with @@ -486,12 +461,18 @@ public class VariantContextUtils { * @param setKey the key name of the set * @param filteredAreUncalled are filtered records uncalled? * @param mergeInfoWithMaxAC should we merge in info from the VC with maximum allele count? - * @return new VariantContext + * @return new VariantContext representing the merge of unsortedVCs */ - public static VariantContext simpleMerge(GenomeLocParser genomeLocParser, Collection unsortedVCs, List priorityListOfVCs, - FilteredRecordMergeType filteredRecordMergeType, GenotypeMergeType genotypeMergeOptions, - boolean annotateOrigin, boolean printMessages, String setKey, - boolean filteredAreUncalled, boolean mergeInfoWithMaxAC ) { + public static VariantContext simpleMerge(final GenomeLocParser genomeLocParser, + final Collection unsortedVCs, + final List priorityListOfVCs, + final FilteredRecordMergeType filteredRecordMergeType, + final GenotypeMergeType genotypeMergeOptions, + final boolean annotateOrigin, + final boolean printMessages, + final String setKey, + final boolean filteredAreUncalled, + final boolean mergeInfoWithMaxAC ) { if ( unsortedVCs == null || unsortedVCs.size() == 0 ) return null; @@ -514,26 +495,28 @@ public class VariantContextUtils { return null; // establish the baseline info from the first VC - VariantContext first = VCs.get(0); - String name = first.getSource(); - GenomeLoc loc = getLocation(genomeLocParser,first); + final VariantContext first = VCs.get(0); + final String name = first.getSource(); + final Allele refAllele = determineReferenceAllele(VCs); - Set alleles = new TreeSet(); - Map genotypes = new TreeMap(); - double negLog10PError = -1; - Set filters = new TreeSet(); - Map attributes = new TreeMap(); - Set inconsistentAttributes = new HashSet(); - String rsID = null; + final Set alleles = new TreeSet(); + final Set filters = new TreeSet(); + final Map attributes = new TreeMap(); + final Set inconsistentAttributes = new HashSet(); + final Set variantSources = new HashSet(); // contains the set of sources we found in our set of VCs that are variant + final Set rsIDs = new LinkedHashSet(1); // most of the time there's one id + + GenomeLoc loc = getLocation(genomeLocParser,first); int depth = 0; int maxAC = -1; - Map attributesWithMaxAC = new TreeMap(); + final Map attributesWithMaxAC = new TreeMap(); + double negLog10PError = -1; VariantContext vcWithMaxAC = null; + Map genotypes = new TreeMap(); // counting the number of filtered and variant VCs - int nFiltered = 0, nVariant = 0; + int nFiltered = 0; - Allele refAllele = determineReferenceAllele(VCs); boolean remapped = false; // cycle through and add info from the other VCs, making sure the loc/reference matches @@ -546,7 +529,7 @@ public class VariantContextUtils { loc = getLocation(genomeLocParser,vc); // get the longest location nFiltered += vc.isFiltered() ? 1 : 0; - nVariant += vc.isVariant() ? 1 : 0; + if ( vc.isVariant() ) variantSources.add(vc.getSource()); AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc, alleles); remapped = remapped || alleleMapping.needsRemapping(); @@ -566,8 +549,7 @@ public class VariantContextUtils { // if (vc.hasAttribute(VCFConstants.DEPTH_KEY)) depth += vc.getAttributeAsInt(VCFConstants.DEPTH_KEY, 0); - if (rsID == null && vc.hasID()) - rsID = vc.getID(); + if ( vc.hasID() && ! vc.getID().equals(VCFConstants.EMPTY_ID_FIELD) ) rsIDs.add(vc.getID()); if (mergeInfoWithMaxAC && vc.hasAttribute(VCFConstants.ALLELE_COUNT_KEY)) { String rawAlleleCounts = vc.getAttributeAsString(VCFConstants.ALLELE_COUNT_KEY, null); // lets see if the string contains a , separator @@ -627,17 +609,16 @@ public class VariantContextUtils { if ( filteredRecordMergeType == FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED && nFiltered != VCs.size() ) filters.clear(); - // we care about where the call came from - if ( annotateOrigin ) { + if ( annotateOrigin ) { // we care about where the call came from String setValue; - if ( nFiltered == 0 && nVariant == priorityListOfVCs.size() ) // nothing was unfiltered + if ( nFiltered == 0 && variantSources.size() == priorityListOfVCs.size() ) // nothing was unfiltered setValue = "Intersection"; else if ( nFiltered == VCs.size() ) // everything was filtered out setValue = "FilteredInAll"; - else if ( nVariant == 0 ) // everyone was reference + else if ( variantSources.isEmpty() ) // everyone was reference setValue = "ReferenceInAll"; - else { // we are filtered in some subset - List s = new ArrayList(); + else { + LinkedHashSet s = new LinkedHashSet(); for ( VariantContext vc : VCs ) if ( vc.isVariant() ) s.add( vc.isFiltered() ? "filterIn" + vc.getSource() : vc.getSource() ); @@ -652,8 +633,10 @@ public class VariantContextUtils { if ( depth > 0 ) attributes.put(VCFConstants.DEPTH_KEY, String.valueOf(depth)); - if ( rsID != null ) - attributes.put(VariantContext.ID_KEY, rsID); + + if ( ! rsIDs.isEmpty() ) { + attributes.put(VariantContext.ID_KEY, Utils.join(",", rsIDs)); + } VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) ); // Trim the padded bases of all alleles if necessary diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java index 35495d797..ac5a87a1f 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/variantutils/CombineVariantsIntegrationTest.java @@ -96,8 +96,8 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void uniqueSNPs() { combine2("pilot2.snps.vcf4.genotypes.vcf", "yri.trio.gatk_glftrio.intersection.annotated.filtered.chr1.vcf", "", "89f55abea8f59e39d1effb908440548c"); } - @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "4836086891f6cbdd40eebef3076d215a"); } - @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "6a34b5d743efda8b2f3b639f3a2f5de8"); } + @Test public void omniHM3Union() { combineSites(" -filteredRecordsMergeType KEEP_IF_ANY_UNFILTERED", "c6adeda751cb2a08690dd9202356629f"); } + @Test public void omniHM3Intersect() { combineSites(" -filteredRecordsMergeType KEEP_IF_ALL_UNFILTERED", "3a08fd5ee18993dfc8882156ccf5d2e9"); } @Test public void threeWayWithRefs() { WalkerTestSpec spec = new WalkerTestSpec( @@ -131,4 +131,13 @@ public class CombineVariantsIntegrationTest extends WalkerTest { @Test public void complexTestMinimal() { combineComplexSites(" -minimalVCF", "df96cb3beb2dbb5e02f80abec7d3571e"); } @Test public void complexTestSitesOnly() { combineComplexSites(" -sites_only", "f704caeaaaed6711943014b847fe381a"); } @Test public void complexTestSitesOnlyMinimal() { combineComplexSites(" -sites_only -minimalVCF", "f704caeaaaed6711943014b847fe381a"); } + + @Test + public void combineDBSNPDuplicateSites() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T CombineVariants -NO_HEADER -L 1:902000-903000 -o %s -R " + b37KGReference + " -V:v1 " + b37dbSNP132, + 1, + Arrays.asList("")); + executeTest("combineDBSNPDuplicateSites:", spec); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java index 2e4dac6da..67943ccb4 100644 --- a/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/baq/BAQUnitTest.java @@ -172,6 +172,17 @@ public class BAQUnitTest extends BaseTest { } } + @Test(enabled = true) + public void testBAQQualRange() { + BAQ baq = new BAQ(1e-3, 0.1, 7, (byte)4, false); // matches current samtools parameters + final byte ref = (byte)'A'; + final byte alt = (byte)'A'; + + for ( int i = 0; i <= SAMUtils.MAX_PHRED_SCORE; i++ ) + Assert.assertTrue(baq.calcEpsilon( ref, alt, (byte)i) >= 0.0, "Failed to get baq epsilon range"); + } + + public void testBAQ(BAQTest test, boolean lookupWithFasta) { BAQ baqHMM = new BAQ(1e-3, 0.1, 7, (byte)4, false); // matches current samtools parameters