From d379c3763a1dcbde79356fa7c4d659554d28c36d Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 29 Feb 2012 18:04:02 -0500 Subject: [PATCH 06/32] DNA Sequence to BitSet and vice-versa conversion tools * Turns DNA sequences (for context covariates) into bit sets for maximum compression * Allows variable context size representation guaranteeing uniqueness. * Works with long precision, so it is limited to a context size of 31 bases (can be extended with BigNumber precision if necessary). * Unit Tests added --- .../broadinstitute/sting/utils/MathUtils.java | 111 ++++++++++++++++-- .../sting/utils/MathUtilsUnitTest.java | 32 ++++- 2 files changed, 127 insertions(+), 16 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index c9ab3b58e..6f2db67ee 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -29,6 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.math.BigDecimal; @@ -1619,30 +1620,120 @@ public class MathUtils { * @param bitSet the bitset * @return an integer with the bitset representation */ - public static int intFrom(final BitSet bitSet) { - int integer = 0; + public static long intFrom(final BitSet bitSet) { + long number = 0; for (int bitIndex = bitSet.nextSetBit(0); bitIndex >= 0; bitIndex = bitSet.nextSetBit(bitIndex+1)) - integer |= 1 << bitIndex; + number |= 1L << bitIndex; - return integer; + return number; } /** * Creates a BitSet representation of a given integer * - * @param integer the number to turn into a bitset + * @param number the number to turn into a bitset * @return a bitset representation of the integer */ - public static BitSet bitSetFrom(int integer) { - BitSet bitSet = new BitSet((int) Math.ceil(Math.sqrt(integer))); + public static BitSet bitSetFrom(long number) { + BitSet bitSet = new BitSet(); int bitIndex = 0; - while (integer > 0) { - if (integer%2 > 0) + while (number > 0) { + if (number%2 > 0) bitSet.set(bitIndex); bitIndex++; - integer /= 2; + number /= 2; } return bitSet; } + /** + * Converts a BitSet into the dna string representation. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * We calculate the length of the resulting DNA sequence by looking at the sum(4^i) that exceeds the + * base_10 representation of the sequence. This is important for us to know how to bring the number + * to a quasi-canonical base_4 representation, and to fill in leading A's (since A's are represented + * as 0's and leading 0's are omitted). + * + * quasi-canonical because A is represented by a 0, therefore, + * instead of : 0, 1, 2, 3, 10, 11, 12, ... + * we have : 0, 1, 2, 3, 00, 01, 02, ... + * + * but we can correctly decode it because we know the final length. + * + * @param bitSet the bitset representation of the dna sequence + * @return the dna sequence represented by the bitset + */ + public static String dnaFrom(final BitSet bitSet) { + long number = intFrom(bitSet); // the base_10 representation of the bit set + long preContext = 0; // the number of combinations skipped to get to the quasi-canonical representation (we keep it to subtract later) + long nextContext = 4; // the next context (we advance it so we know which one was preceding it). + int i = 1; // the calculated length of the DNA sequence given the base_10 representation of its BitSet. + while (nextContext <= number) { // find the length of the dna string (i) + preContext = nextContext; // keep track of the number of combinations in the preceding context + nextContext += Math.pow(4, ++i);// calculate the next context + } + number -= preContext; // subtract the the number of combinations of the preceding context from the number to get to the quasi-canonical representation + + String dna = ""; + while (number > 0) { // perform a simple base_10 to base_4 conversion (quasi-canonical) + byte base = (byte) (number % 4); + switch (base) { + case 0 : dna = "A" + dna; break; + case 1 : dna = "C" + dna; break; + case 2 : dna = "G" + dna; break; + case 3 : dna = "T" + dna; break; + } + number /= 4; + } + for (int j = dna.length(); j < i; j++) + dna = "A" + dna; // add leading A's as necessary (due to the "quasi" canonical status, see description above) + + return dna; + } + + /** + * Creates a BitSet representation of a given dna string. + * + * Warning: This conversion is limited to long precision, therefore the dna sequence cannot + * be longer than 31 bases. To increase this limit, use BigNumbers instead of long and create + * a bitSetFrom(BigNumber) method. + * + * The bit representation of a dna string is the simple: + * 0 A 4 AA 8 CA + * 1 C 5 AC ... + * 2 G 6 AG 1343 TTGGT + * 3 T 7 AT 1364 TTTTT + * + * To convert from dna to number, we convert the dna string to base10 and add all combinations that + * preceded the string (with smaller lengths). + * + * @param dna the dna sequence + * @return the bitset representing the dna sequence + */ + public static BitSet bitSetFrom(String dna) { + if (dna.length() > 31) + throw new ReviewedStingException(String.format("DNA Length cannot be bigger than 31. dna: %s (%d)", dna, dna.length())); + + long baseTen = 0; // the number in base_10 that we are going to use to generate the bit set + long preContext = 0; // the sum of all combinations that preceded the length of the dna string + for (int i=0; i0) + preContext += Math.pow(4, i); // each length will have 4^i combinations (e.g 1 = 4, 2 = 16, 3 = 64, ...) + } + + return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. + } + + } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 5b50c91a6..75fc44873 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -207,14 +207,34 @@ public class MathUtilsUnitTest extends BaseTest { @Test(enabled = true) public void testIntAndBitSetConversion() { - Assert.assertEquals(428, MathUtils.intFrom(MathUtils.bitSetFrom(428))); - Assert.assertEquals(239847, MathUtils.intFrom(MathUtils.bitSetFrom(239847))); - Assert.assertEquals(12726, MathUtils.intFrom(MathUtils.bitSetFrom(12726))); - Assert.assertEquals(0, MathUtils.intFrom(MathUtils.bitSetFrom(0))); - Assert.assertEquals(1, MathUtils.intFrom(MathUtils.bitSetFrom(1))); - Assert.assertEquals(65536, MathUtils.intFrom(MathUtils.bitSetFrom(65536))); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(239847)), 239847); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(12726)), 12726); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(0)), 0); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(1)), 1); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(65536)), 65536); + Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(Long.MAX_VALUE)), Long.MAX_VALUE); } + @Test(enabled = true) + public void testDNAAndBitSetConversion() { + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("ACGT")), "ACGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AGGTGTTGT")), "AGGTGTTGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("A")), "A"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("C")), "C"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("G")), "G"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("T")), "T"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CC")), "CC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AA")), "AA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AAAA")), "AAAA"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("CCCCCCCCCCCCCC")), "CCCCCCCCCCCCCC"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GGGGGGGGGGGGGG")), "GGGGGGGGGGGGGG"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("TTTTTTTTTTTTTT")), "TTTTTTTTTTTTTT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("GTAGACCGATCTCAGCTAGT")), "GTAGACCGATCTCAGCTAGT"); + Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31) + } + + private boolean hasUniqueElements(Object[] x) { for (int i = 0; i < x.length; i++) for (int j = i + 1; j < x.length; j++) From 9e95b10789ae09600675c2da11a3ef6e8b76fac0 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Wed, 29 Feb 2012 18:56:11 -0500 Subject: [PATCH 07/32] Context covariate now operates as a highly compressed bitset * All contexts with 'N' bases are now collapsed as uninformative * Context size is now represented internally as a BitSet but output as a dna string * Temporarily disabled sorted outputs because of null objects --- .../gatk/walkers/bqsr/ContextCovariate.java | 54 +++++++++---------- .../sting/gatk/walkers/bqsr/Covariate.java | 2 +- .../gatk/walkers/bqsr/CovariateValues.java | 14 ++--- .../gatk/walkers/bqsr/CycleCovariate.java | 2 +- .../walkers/bqsr/QualityScoreCovariate.java | 4 +- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 2 +- 6 files changed, 35 insertions(+), 43 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java index 89a30e4f5..a1ab73341 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariate.java @@ -26,10 +26,12 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.BaseUtils; +import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; +import java.util.BitSet; /** * Created by IntelliJ IDEA. @@ -43,10 +45,6 @@ public class ContextCovariate implements StandardCovariate { private int insertionsContextSize; private int deletionsContextSize; - private String mismatchesNoContext = ""; - private String insertionsNoContext = ""; - private String deletionsNoContext = ""; - // Initialize any member variables using the command-line arguments passed to the walkers @Override public void initialize(final RecalibrationArgumentCollection RAC) { @@ -57,29 +55,26 @@ public class ContextCovariate implements StandardCovariate { if (mismatchesContextSize <= 0 || insertionsContextSize <= 0 || deletionsContextSize <= 0) throw new UserException(String.format("Context Size must be positive, if you don't want to use the context covariate, just turn it off instead. Mismatches: %d Insertions: %d Deletions:%d", mismatchesContextSize, insertionsContextSize, deletionsContextSize)); - // initialize no context strings given the size of the context for each covariate type - mismatchesNoContext = makeAllNStringWithLength(mismatchesContextSize); - insertionsNoContext = makeAllNStringWithLength(insertionsContextSize); - deletionsNoContext = makeAllNStringWithLength( deletionsContextSize); } @Override public CovariateValues getValues(final GATKSAMRecord read) { int l = read.getReadLength(); - String[] mismatches = new String [l]; - String[] insertions = new String [l]; - String[] deletions = new String [l]; + BitSet[] mismatches = new BitSet[l]; + BitSet[] insertions = new BitSet[l]; + BitSet[] deletions = new BitSet[l]; final boolean negativeStrand = read.getReadNegativeStrandFlag(); byte[] bases = read.getReadBases(); - if (negativeStrand) { - bases = BaseUtils.simpleReverseComplement(bases); //this is NOT in-place - } + if (negativeStrand) + bases = BaseUtils.simpleReverseComplement(bases); + for (int i = 0; i < read.getReadLength(); i++) { - mismatches[i] = contextWith(bases, i, mismatchesContextSize, mismatchesNoContext); - insertions[i] = contextWith(bases, i, insertionsContextSize, insertionsNoContext); - deletions[i] = contextWith(bases, i, deletionsContextSize, deletionsNoContext); + mismatches[i] = contextWith(bases, i, mismatchesContextSize); + insertions[i] = contextWith(bases, i, insertionsContextSize); + deletions[i] = contextWith(bases, i, deletionsContextSize); } + if (negativeStrand) { reverse(mismatches); reverse(insertions); @@ -90,7 +85,7 @@ public class ContextCovariate implements StandardCovariate { // Used to get the covariate's value from input csv file during on-the-fly recalibration @Override - public final Comparable getValue(final String str) { + public final Object getValue(final String str) { return str; } @@ -100,29 +95,28 @@ public class ContextCovariate implements StandardCovariate { * @param bases the bases in the read to build the context from * @param offset the position in the read to calculate the context for * @param contextSize context size to use building the context - * @param noContextString string to return if the position is not far enough in the read to have a full context before. * @return */ - private String contextWith(byte [] bases, int offset, int contextSize, String noContextString) { - return (offset < contextSize) ? noContextString : new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + private BitSet contextWith(byte [] bases, int offset, int contextSize) { + if (offset < contextSize) + return null; + + String context = new String(Arrays.copyOfRange(bases, offset - contextSize, offset)); + if (context.contains("N")) + return null; + + return MathUtils.bitSetFrom(context); } - - private String makeAllNStringWithLength(int length) { - String s = ""; - for (int i=0; i Date: Thu, 1 Mar 2012 15:01:11 -0500 Subject: [PATCH 10/32] ReadGroupProperties walker and associated infrastructure -- ReadGroupProperties: Emits a GATKReport containing read group, sample, library, platform, center, median insert size and median read length for each read group in every BAM file. -- Median tool that collects up to a given maximum number of elements and returns the median of the elements. -- Unit and integration tests for everything. -- Making name of TestProvider protected so subclasses and override name more easily --- .../diagnostics/ReadGroupProperties.java | 192 ++++++++++++++++++ .../broadinstitute/sting/utils/Median.java | 93 +++++++++ .../org/broadinstitute/sting/BaseTest.java | 2 +- .../broadinstitute/sting/MedianUnitTest.java | 123 +++++++++++ .../ReadGroupPropertiesIntegrationTest.java | 44 ++++ 5 files changed, 453 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/Median.java create mode 100644 public/java/test/org/broadinstitute/sting/MedianUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java new file mode 100644 index 000000000..85f587aaf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import net.sf.samtools.SAMReadGroupRecord; +import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Output; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.ReadMetaDataTracker; +import org.broadinstitute.sting.gatk.report.GATKReport; +import org.broadinstitute.sting.gatk.report.GATKReportTable; +import org.broadinstitute.sting.gatk.walkers.ReadWalker; +import org.broadinstitute.sting.utils.Median; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.io.PrintStream; +import java.util.HashMap; +import java.util.Map; + +/** + * Emits a GATKReport containing read group, sample, library, platform, center, median insert size and + * median read length for each read group in every BAM file. + * + * Note that this walker stops when all read groups have been observed at least a few thousand times so that + * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate + * timeframe. + * + *

Input

+ *

+ * Any number of BAM files + *

+ * + *

Output

+ *

+ * GATKReport containing read group, sample, library, platform, center, median insert size and median read length. + * + * For example, running this tool on the NA12878 data sets: + * + *

+ *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
+ *      readgroup  sample   library       platform  center  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI                     101                 416
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI                     101                 388
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI                     101                 415
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI                     101                 387
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI                     101                 414
+ *      
+ *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T ReadGroupProperties
+ *      -I example1.bam -I example2.bam etc
+ *      -R reference.fasta
+ *      -o example.gatkreport.txt
+ *  
+ * + * @author Mark DePristo + */ + + + +public class ReadGroupProperties extends ReadWalker { + @Output + public PrintStream out; + + @Argument(shortName="maxElementsForMedian", doc="Calculate median from the first maxElementsForMedian values observed", required=false) + public int MAX_VALUES_FOR_MEDIAN = 10000; + + private final static String TABLE_NAME = "ReadGroupProperties"; + private final Map> readLengths = new HashMap>(); + private final Map> insertSizes = new HashMap>(); + + @Override + public void initialize() { + for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + readLengths.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); + insertSizes.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); + } + } + + @Override + public boolean filter(ReferenceContext ref, GATKSAMRecord read) { + return ! (read.getReadFailsVendorQualityCheckFlag() || read.getReadUnmappedFlag()); + } + + @Override + public boolean isDone() { + // TODO -- this is far too slow! + return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes)); + } + + private final boolean anyMedianNeedsData(Map> medianMap) { + for ( Median median : medianMap.values() ) { + if ( ! median.isFull() ) + return true; + } + + return false; + } + + private final void updateMedian(final Median median, final int value) { + median.add(value); + } + + @Override + public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { + final String rg = read.getReadGroup().getId(); + + updateMedian(readLengths.get(rg), read.getReadLength()); + if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) { + //logger.info(rg + " => " + Math.abs(read.getInferredInsertSize())); + updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize())); + } + + return null; + } + + @Override + public Integer reduceInit() { + return null; + } + + @Override + public Integer reduce(Integer integer, Integer integer1) { + return null; + } + + @Override + public void onTraversalDone(Integer sum) { + final GATKReport report = new GATKReport(); + report.addTable(TABLE_NAME, "Table of read group properties"); + GATKReportTable table = report.getTable(TABLE_NAME); + + table.addPrimaryKey("readgroup"); + //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and + //* median read length for each read group in every BAM file. + table.addColumn("sample", "NA"); + table.addColumn("library", "NA"); + table.addColumn("platform", "NA"); + table.addColumn("center", "NA"); + table.addColumn("median.read.length", Integer.valueOf(0)); + table.addColumn("median.insert.size", Integer.valueOf(0)); + + for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { + final String rgID = rg.getId(); + table.set(rgID, "sample", rg.getSample()); + table.set(rgID, "library", rg.getLibrary()); + table.set(rgID, "platform", rg.getPlatform()); + table.set(rgID, "center", rg.getSequencingCenter()); + table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0)); + table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0)); + } + + report.print(out); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/Median.java b/public/java/src/org/broadinstitute/sting/utils/Median.java new file mode 100644 index 000000000..7ebe8d2d7 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/Median.java @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + +import java.util.*; + +/** + * Utility class for calculating median from a data set, potentially limiting the size of data to a + * fixed amount + * + * @author Your Name + * @since Date created + */ +public class Median { + final List values; + final int maxValuesToKeep; + boolean sorted = false; + + public Median() { + this(Integer.MAX_VALUE); + } + + public Median(final int maxValuesToKeep) { + this.maxValuesToKeep = maxValuesToKeep; + this.values = new ArrayList(); + } + + public boolean isFull() { + return values.size() >= maxValuesToKeep; + } + + public int size() { + return values.size(); + } + + public boolean isEmpty() { + return values.isEmpty(); + } + + public T getMedian() { + if ( isEmpty() ) + throw new IllegalStateException("Cannot get median value from empty array"); + return getMedian(null); // note that value null will never be used + } + + /** + * Returns the floor(n + 1 / 2) item from the list of values if the list + * has values, or defaultValue is the list is empty. + */ + public T getMedian(final T defaultValue) { + if ( isEmpty() ) + return defaultValue; + + if ( ! sorted ) { + sorted = true; + Collections.sort(values); + } + + final int offset = (int)Math.floor((values.size() + 1) * 0.5) - 1; + return values.get(offset); + } + + public boolean add(final T value) { + if ( ! isFull() ) { + sorted = false; + return values.add(value); + } + else + return false; + } +} diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index 626b91cbf..ac3a970f9 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -141,7 +141,7 @@ public abstract class BaseTest { */ public static class TestDataProvider { private static final Map> tests = new HashMap>(); - private String name; + protected String name; /** * Create a new TestDataProvider instance bound to the class variable C diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java new file mode 100644 index 000000000..c12db9b77 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +// our package +package org.broadinstitute.sting; + + +// the imports for unit testing. + + +import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine; +import org.broadinstitute.sting.utils.Median; +import org.testng.Assert; +import org.testng.annotations.BeforeSuite; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + + +public class MedianUnitTest extends BaseTest { + LikelihoodCalculationEngine engine; + + @BeforeSuite + public void before() { + engine = new LikelihoodCalculationEngine(0, 0, false); + } + + // -------------------------------------------------------------------------------- + // + // Provider + // + // -------------------------------------------------------------------------------- + + private class MedianTestProvider extends TestDataProvider { + final List values = new ArrayList(); + final int cap; + final Integer expected; + + public MedianTestProvider(int expected, int cap, Integer ... values) { + super(MedianTestProvider.class); + this.expected = expected; + this.cap = cap; + this.values.addAll(Arrays.asList(values)); + this.name = String.format("values=%s expected=%d cap=%d", this.values, expected, cap); + } + } + + @DataProvider(name = "MedianTestProvider") + public Object[][] makeMedianTestProvider() { + new MedianTestProvider(1, 1000, 0, 1, 2); + new MedianTestProvider(1, 1000, 1, 0, 1, 2); + new MedianTestProvider(1, 1000, 0, 1, 2, 3); + new MedianTestProvider(2, 1000, 0, 1, 2, 3, 4); + new MedianTestProvider(2, 1000, 4, 1, 2, 3, 0); + new MedianTestProvider(1, 1000, 1); + new MedianTestProvider(2, 1000, 2); + new MedianTestProvider(1, 1000, 1, 2); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 1, 2, 3); + new MedianTestProvider(2, 3, 1, 2, 3, 4); + new MedianTestProvider(2, 3, 1, 2, 3, 4, 5); + + new MedianTestProvider(1, 3, 1); + new MedianTestProvider(1, 3, 1, 2); + new MedianTestProvider(2, 3, 3, 2, 1); + new MedianTestProvider(3, 3, 4, 3, 2, 1); + new MedianTestProvider(4, 3, 5, 4, 3, 2, 1); + + return MedianTestProvider.getTests(MedianTestProvider.class); + } + + @Test(dataProvider = "MedianTestProvider") + public void testBasicLikelihoods(MedianTestProvider cfg) { + final Median median = new Median(cfg.cap); + + int nAdded = 0; + for ( final int value : cfg.values ) + if ( median.add(value) ) + nAdded++; + + Assert.assertEquals(nAdded, median.size()); + + Assert.assertEquals(cfg.values.isEmpty(), median.isEmpty()); + Assert.assertEquals(cfg.values.size() >= cfg.cap, median.isFull()); + Assert.assertEquals(median.getMedian(), cfg.expected, cfg.toString()); + } + + @Test(expectedExceptions = IllegalStateException.class) + public void testEmptyMedian() { + final Median median = new Median(); + Assert.assertTrue(median.isEmpty()); + final Integer d = 100; + Assert.assertEquals(median.getMedian(d), d); + median.getMedian(); + } + +} \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java new file mode 100644 index 000000000..84f7fa363 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.diagnostics; + +import org.broadinstitute.sting.WalkerTest; +import org.testng.annotations.Test; + +import java.util.Arrays; + +/** + * Tests ReadGroupProperties + */ +public class ReadGroupPropertiesIntegrationTest extends WalkerTest { + @Test + public void basicTest() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", + 1, + Arrays.asList("1795e3157ab23e7e597acec334e29525")); + executeTest("ReadGroupProperties:", spec); + } +} \ No newline at end of file From 29f74b658bde8012abfbc69699693053b5fd0ebd Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Thu, 1 Mar 2012 17:11:05 -0500 Subject: [PATCH 11/32] Unit tests for the context covariate this is simple, but it's the infra-structure to start messing around with the context. --- .../bqsr/ContextCovariateUnitTest.java | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java new file mode 100644 index 000000000..aa6a72ef9 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ContextCovariateUnitTest.java @@ -0,0 +1,103 @@ +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.BitSet; +import java.util.Random; + +/** + * Short one line description of the walker. + * + *

+ * [Long description of the walker] + *

+ * + * + *

Input

+ *

+ * [Description of the Input] + *

+ * + *

Output

+ *

+ * [Description of the Output] + *

+ * + *

Examples

+ *
+ *    java
+ *      -jar GenomeAnalysisTK.jar
+ *      -T [walker name]
+ *  
+ * + * @author Mauricio Carneiro + * @since 3/1/12 + */ +public class ContextCovariateUnitTest { + ContextCovariate covariate; + RecalibrationArgumentCollection RAC; + Random random; + + @BeforeClass + public void init() { + RAC = new RecalibrationArgumentCollection(); + covariate = new ContextCovariate(); + random = GenomeAnalysisEngine.getRandomGenerator(); + covariate.initialize(RAC); + + } + + @Test(enabled = true) + public void testSimpleContexts() { + byte [] quals = createRandomReadQuals(101); + byte [] bbases = createRandomReadBases(101); + String bases = stringFrom(bbases); + GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bbases, quals, bbases.length + "M"); + CovariateValues values = covariate.getValues(read); + verifyCovariateArray((BitSet []) values.getMismatches(), RAC.MISMATCHES_CONTEXT_SIZE, bases); + verifyCovariateArray((BitSet []) values.getInsertions(), RAC.INSERTIONS_CONTEXT_SIZE, bases); + verifyCovariateArray((BitSet []) values.getDeletions(), RAC.DELETIONS_CONTEXT_SIZE, bases); + } + + private void verifyCovariateArray(BitSet[] values, int contextSize, String bases) { + for (int i=0; i= contextSize) + Assert.assertEquals(MathUtils.dnaFrom(values[i]), bases.substring(i-contextSize, i)); + else + Assert.assertNull(values[i]); + } + } + + private String stringFrom(byte [] array) { + String s = ""; + for (byte value : array) + s += (char) value; + return s; + } + + private byte [] createRandomReadQuals(int length) { + byte [] quals = new byte[length]; + for (int i=0; i Date: Thu, 1 Mar 2012 17:55:28 -0500 Subject: [PATCH 13/32] ugly RG encoding --- .../gatk/walkers/bqsr/ReadGroupCovariate.java | 22 +++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java index 6076a7a20..aecdd3d4b 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadGroupCovariate.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.Arrays; +import java.util.HashMap; /* * Copyright (c) 2009 The Broad Institute @@ -38,6 +39,10 @@ import java.util.Arrays; */ public class ReadGroupCovariate implements RequiredCovariate { + + private final HashMap readGroupLookupTable = new HashMap(); + private final HashMap readGroupReverseLookupTable = new HashMap(); + private short nextId = 0; // Initialize any member variables using the command-line arguments passed to the walkers @Override @@ -48,8 +53,17 @@ public class ReadGroupCovariate implements RequiredCovariate { public CovariateValues getValues(final GATKSAMRecord read) { final int l = read.getReadLength(); final String readGroupId = read.getReadGroup().getReadGroupId(); - String [] readGroups = new String[l]; - Arrays.fill(readGroups, readGroupId); + short shortId; + if (readGroupLookupTable.containsKey(readGroupId)) + shortId = readGroupLookupTable.get(readGroupId); + else { + shortId = nextId; + readGroupLookupTable.put(readGroupId, nextId); + readGroupReverseLookupTable.put(nextId, readGroupId); + nextId++; + } + Short [] readGroups = new Short[l]; + Arrays.fill(readGroups, shortId); return new CovariateValues(readGroups, readGroups, readGroups); } @@ -58,6 +72,10 @@ public class ReadGroupCovariate implements RequiredCovariate { public final Object getValue(final String str) { return str; } + + public final String decodeReadGroup(final short id) { + return readGroupReverseLookupTable.get(id); + } } From 2f334a57c2433aade864d1c28fa08b6fc5f28dec Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 1 Mar 2012 18:43:23 -0500 Subject: [PATCH 14/32] ReadGroupProperties mk2 -- Includes paired end status (T/F) -- Includes count of reads used in calculation -- Includes simple read type (2x76 for example) -- Better handling of insert size, read length when there's no data, or the data isn't paired end by emitting NA not 0 --- .../diagnostics/ReadGroupProperties.java | 107 +++++++++++------- .../ReadGroupPropertiesIntegrationTest.java | 2 +- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index 85f587aaf..c192d04e7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -40,8 +40,9 @@ import java.util.HashMap; import java.util.Map; /** - * Emits a GATKReport containing read group, sample, library, platform, center, median insert size and - * median read length for each read group in every BAM file. + * Emits a GATKReport containing read group, sample, library, platform, center, paired end status, + * simple read type name (e.g. 2x76) median insert size and median read length for each read group + * in every provided BAM file * * Note that this walker stops when all read groups have been observed at least a few thousand times so that * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate @@ -60,23 +61,23 @@ import java.util.Map; * *
  *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
- *      readgroup  sample   library       platform  center  median.read.length  median.insert.size
- *      20FUK.1    NA12878  Solexa-18483  illumina  BI                     101                 387
- *      20FUK.2    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20FUK.3    NA12878  Solexa-18483  illumina  BI                     101                 388
- *      20FUK.4    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20FUK.5    NA12878  Solexa-18483  illumina  BI                     101                 387
- *      20FUK.6    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20FUK.7    NA12878  Solexa-18483  illumina  BI                     101                 388
- *      20FUK.8    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20GAV.1    NA12878  Solexa-18483  illumina  BI                     101                 388
- *      20GAV.2    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20GAV.3    NA12878  Solexa-18483  illumina  BI                     101                 388
- *      20GAV.4    NA12878  Solexa-18484  illumina  BI                     101                 416
- *      20GAV.5    NA12878  Solexa-18483  illumina  BI                     101                 388
- *      20GAV.6    NA12878  Solexa-18484  illumina  BI                     101                 415
- *      20GAV.7    NA12878  Solexa-18483  illumina  BI                     101                 387
- *      20GAV.8    NA12878  Solexa-18484  illumina  BI                     101                 414
+ *      readgroup  sample   library       platform  center  has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI      true           true                      10100  2x101                            101                 387
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI      true           true                      10115  2x101                            101                 415
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI      true           true                      10090  2x101                            101                 388
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI      true           true                      10081  2x101                            101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI      true           true                      10078  2x101                            101                 387
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI      true           true                      10072  2x101                            101                 415
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI      true           true                      10086  2x101                            101                 388
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI      true           true                      10097  2x101                            101                 415
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI      true           true                      10135  2x101                            101                 388
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI      true           true                      10172  2x101                            101                 415
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI      true           true                      10141  2x101                            101                 388
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI      true           true                      10251  2x101                            101                 416
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI      true           true                      10145  2x101                            101                 388
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI      true           true                      10184  2x101                            101                 415
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI      true           true                      10174  2x101                            101                 387
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI      true           true                      10141  2x101                            101                 414
  *      
*

* @@ -103,14 +104,22 @@ public class ReadGroupProperties extends ReadWalker { public int MAX_VALUES_FOR_MEDIAN = 10000; private final static String TABLE_NAME = "ReadGroupProperties"; - private final Map> readLengths = new HashMap>(); - private final Map> insertSizes = new HashMap>(); + private final Map readGroupInfo = new HashMap(); + + private class PerReadGroupInfo { + public final Median readLength = new Median(MAX_VALUES_FOR_MEDIAN); + public final Median insertSize = new Median(MAX_VALUES_FOR_MEDIAN); + public int nReadsSeen = 0, nReadsPaired = 0; + + public boolean needsMoreData() { + return ! readLength.isFull() || ! insertSize.isFull(); + } + } @Override public void initialize() { for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { - readLengths.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); - insertSizes.put(rg.getId(), new Median(MAX_VALUES_FOR_MEDIAN)); + readGroupInfo.put(rg.getId(), new PerReadGroupInfo()); } } @@ -121,31 +130,28 @@ public class ReadGroupProperties extends ReadWalker { @Override public boolean isDone() { - // TODO -- this is far too slow! - return ! (anyMedianNeedsData(readLengths) || anyMedianNeedsData(insertSizes)); - } - - private final boolean anyMedianNeedsData(Map> medianMap) { - for ( Median median : medianMap.values() ) { - if ( ! median.isFull() ) - return true; + for ( PerReadGroupInfo info : readGroupInfo.values() ) { + if ( info.needsMoreData() ) + return false; } - return false; - } - - private final void updateMedian(final Median median, final int value) { - median.add(value); + return true; } @Override public Integer map(ReferenceContext referenceContext, GATKSAMRecord read, ReadMetaDataTracker readMetaDataTracker) { - final String rg = read.getReadGroup().getId(); + final String rgID = read.getReadGroup().getId(); + final PerReadGroupInfo info = readGroupInfo.get(rgID); - updateMedian(readLengths.get(rg), read.getReadLength()); - if ( read.getReadPairedFlag() && read.getInferredInsertSize() != 0) { - //logger.info(rg + " => " + Math.abs(read.getInferredInsertSize())); - updateMedian(insertSizes.get(rg), Math.abs(read.getInferredInsertSize())); + if ( info.needsMoreData() ) { + info.readLength.add(read.getReadLength()); + info.nReadsSeen++; + if ( read.getReadPairedFlag() ) { + info.nReadsPaired++; + if ( read.getInferredInsertSize() != 0) { + info.insertSize.add(Math.abs(read.getInferredInsertSize())); + } + } } return null; @@ -174,17 +180,32 @@ public class ReadGroupProperties extends ReadWalker { table.addColumn("library", "NA"); table.addColumn("platform", "NA"); table.addColumn("center", "NA"); + table.addColumn("has.any.reads", "false"); + table.addColumn("is.paired.end", "false"); + table.addColumn("n.reads.analyzed", "NA"); + table.addColumn("simple.read.type", "NA"); table.addColumn("median.read.length", Integer.valueOf(0)); table.addColumn("median.insert.size", Integer.valueOf(0)); for ( final SAMReadGroupRecord rg : getToolkit().getSAMFileHeader().getReadGroups() ) { final String rgID = rg.getId(); + PerReadGroupInfo info = readGroupInfo.get(rgID); + + // we are paired if > 25% of reads are paired + final boolean isPaired = info.nReadsPaired / (1.0 * (info.nReadsSeen+1)) > 0.25; + final boolean hasAnyReads = info.nReadsSeen > 0; + final int readLength = info.readLength.getMedian(0); + table.set(rgID, "sample", rg.getSample()); table.set(rgID, "library", rg.getLibrary()); table.set(rgID, "platform", rg.getPlatform()); table.set(rgID, "center", rg.getSequencingCenter()); - table.set(rgID, "median.read.length", readLengths.get(rgID).getMedian(0)); - table.set(rgID, "median.insert.size", insertSizes.get(rgID).getMedian(0)); + table.set(rgID, "has.any.reads", hasAnyReads); + table.set(rgID, "is.paired.end", isPaired); + table.set(rgID, "n.reads.analyzed", info.nReadsSeen); + table.set(rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA"); + table.set(rgID, "median.read.length", hasAnyReads ? readLength : "NA" ); + table.set(rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" ); } report.print(out); diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java index 84f7fa363..04c30a8fe 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -38,7 +38,7 @@ public class ReadGroupPropertiesIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", 1, - Arrays.asList("1795e3157ab23e7e597acec334e29525")); + Arrays.asList("8e4d09665c0b65c971bd5dead24f95fe")); executeTest("ReadGroupProperties:", spec); } } \ No newline at end of file From 0ad7d5fbc1129569c7a9ce633e0bd8eb9fb5c072 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Thu, 1 Mar 2012 22:41:13 -0500 Subject: [PATCH 15/32] Standalone common Pair HMM utility class with associated unit tests. --- .../broadinstitute/sting/utils/collections/NestedHashMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java index d280ac804..8652d3c28 100755 --- a/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java +++ b/public/java/src/org/broadinstitute/sting/utils/collections/NestedHashMap.java @@ -34,7 +34,7 @@ import java.util.Map; * Date: Dec 29, 2009 */ -public class NestedHashMap{ +public class NestedHashMap { public final Map data = new HashMap(); From 0a7137616c6cddaa39cac62f71fc276dcdef36f7 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 09:11:59 -0500 Subject: [PATCH 16/32] Now converts gatkreports to properly typed R data types in gsa.read.gatkreport -- use the general function type.convert from read.table to automagically convert the string data to booleans, factors, and numeric types as appropriate. Vastly better than the previous behavior which only worked for numerics, in some cases. --- .../sting/utils/R/gsalib/R/gsa.read.gatkreport.R | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R index 46bbf7eda..876cf5cbc 100644 --- a/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R +++ b/public/R/src/org/broadinstitute/sting/utils/R/gsalib/R/gsa.read.gatkreport.R @@ -4,11 +4,9 @@ colnames(d) = tableHeader; for (i in 1:ncol(d)) { - v = suppressWarnings(as.numeric(d[,i])); - - if (length(na.omit(as.numeric(v))) == length(d[,i])) { - d[,i] = v; - } + # use the general type.convert infrastructure of read.table to convert column data to R types + v = type.convert(d[,i]) + d[,i] = v; } usedNames = ls(envir=tableEnv, pattern=tableName); From 1e07e97b589eb73cebe5052fb656b3c54295b618 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 2 Mar 2012 13:30:17 -0500 Subject: [PATCH 17/32] Optimization: create allele list just once, not for each genotype --- .../sting/gatk/walkers/annotator/RankSumTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java index 3f555f780..00968943d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/RankSumTest.java @@ -43,15 +43,15 @@ public abstract class RankSumTest extends InfoFieldAnnotation implements Standar final ArrayList altQuals = new ArrayList(); if ( vc.isSNP() ) { + final List altAlleles = new ArrayList(); + for ( final Allele a : vc.getAlternateAlleles() ) + altAlleles.add(a.getBases()[0]); + for ( final Genotype genotype : genotypes.iterateInSampleNameOrder() ) { final AlignmentContext context = stratifiedContexts.get(genotype.getSampleName()); if ( context == null ) continue; - final List altAlleles = new ArrayList(); - for ( final Allele a : vc.getAlternateAlleles() ) - altAlleles.add(a.getBases()[0]); - fillQualsFromPileup(ref.getBase(), altAlleles, context.getBasePileup(), refQuals, altQuals); } } else if ( vc.isIndel() || vc.isMixed() ) { From ba71b0aee4267be72ed32eaaa0b89cc6434f1a1a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 14:51:23 -0500 Subject: [PATCH 19/32] ReadGroupProperties mk3 -- Includes sequencing date --- .../diagnostics/ReadGroupProperties.java | 44 ++++++++++--------- 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index c192d04e7..2eb3b5e85 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -36,13 +36,14 @@ import org.broadinstitute.sting.utils.Median; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.text.DateFormat; import java.util.HashMap; import java.util.Map; /** - * Emits a GATKReport containing read group, sample, library, platform, center, paired end status, - * simple read type name (e.g. 2x76) median insert size and median read length for each read group - * in every provided BAM file + * Emits a GATKReport containing read group, sample, library, platform, center, sequencing data, + * paired end status, simple read type name (e.g. 2x76) median insert size and median read length + * for each read group in every provided BAM file * * Note that this walker stops when all read groups have been observed at least a few thousand times so that * the median statistics are well determined. It is safe to run it WG and it'll finish in an appropriate @@ -61,23 +62,23 @@ import java.util.Map; * *
  *      ##:GATKReport.v0.2 ReadGroupProperties : Table of read group properties
- *      readgroup  sample   library       platform  center  has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
- *      20FUK.1    NA12878  Solexa-18483  illumina  BI      true           true                      10100  2x101                            101                 387
- *      20FUK.2    NA12878  Solexa-18484  illumina  BI      true           true                      10115  2x101                            101                 415
- *      20FUK.3    NA12878  Solexa-18483  illumina  BI      true           true                      10090  2x101                            101                 388
- *      20FUK.4    NA12878  Solexa-18484  illumina  BI      true           true                      10081  2x101                            101                 415
- *      20FUK.5    NA12878  Solexa-18483  illumina  BI      true           true                      10078  2x101                            101                 387
- *      20FUK.6    NA12878  Solexa-18484  illumina  BI      true           true                      10072  2x101                            101                 415
- *      20FUK.7    NA12878  Solexa-18483  illumina  BI      true           true                      10086  2x101                            101                 388
- *      20FUK.8    NA12878  Solexa-18484  illumina  BI      true           true                      10097  2x101                            101                 415
- *      20GAV.1    NA12878  Solexa-18483  illumina  BI      true           true                      10135  2x101                            101                 388
- *      20GAV.2    NA12878  Solexa-18484  illumina  BI      true           true                      10172  2x101                            101                 415
- *      20GAV.3    NA12878  Solexa-18483  illumina  BI      true           true                      10141  2x101                            101                 388
- *      20GAV.4    NA12878  Solexa-18484  illumina  BI      true           true                      10251  2x101                            101                 416
- *      20GAV.5    NA12878  Solexa-18483  illumina  BI      true           true                      10145  2x101                            101                 388
- *      20GAV.6    NA12878  Solexa-18484  illumina  BI      true           true                      10184  2x101                            101                 415
- *      20GAV.7    NA12878  Solexa-18483  illumina  BI      true           true                      10174  2x101                            101                 387
- *      20GAV.8    NA12878  Solexa-18484  illumina  BI      true           true                      10141  2x101                            101                 414
+ *      readgroup  sample   library       platform  center  date     has.any.reads  is.paired.end  n.reads.analyzed  simple.read.type  median.read.length  median.insert.size
+ *      20FUK.1    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        498  2x101                            101                 386
+ *      20FUK.2    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        476  2x101                            101                 417
+ *      20FUK.3    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        407  2x101                            101                 387
+ *      20FUK.4    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        389  2x101                            101                 415
+ *      20FUK.5    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        433  2x101                            101                 386
+ *      20FUK.6    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        480  2x101                            101                 418
+ *      20FUK.7    NA12878  Solexa-18483  illumina  BI      2/2/10   true           true                        450  2x101                            101                 386
+ *      20FUK.8    NA12878  Solexa-18484  illumina  BI      2/2/10   true           true                        438  2x101                            101                 418
+ *      20GAV.1    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        490  2x101                            101                 391
+ *      20GAV.2    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        485  2x101                            101                 417
+ *      20GAV.3    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        460  2x101                            101                 392
+ *      20GAV.4    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        434  2x101                            101                 415
+ *      20GAV.5    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        479  2x101                            101                 389
+ *      20GAV.6    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        461  2x101                            101                 416
+ *      20GAV.7    NA12878  Solexa-18483  illumina  BI      1/26/10  true           true                        509  2x101                            101                 386
+ *      20GAV.8    NA12878  Solexa-18484  illumina  BI      1/26/10  true           true                        476  2x101                            101                 410                           101                 414
  *      
*

* @@ -172,6 +173,7 @@ public class ReadGroupProperties extends ReadWalker { final GATKReport report = new GATKReport(); report.addTable(TABLE_NAME, "Table of read group properties"); GATKReportTable table = report.getTable(TABLE_NAME); + DateFormat dateFormatter = DateFormat.getDateInstance(DateFormat.SHORT); table.addPrimaryKey("readgroup"); //* Emits a GATKReport containing read group, sample, library, platform, center, median insert size and @@ -180,6 +182,7 @@ public class ReadGroupProperties extends ReadWalker { table.addColumn("library", "NA"); table.addColumn("platform", "NA"); table.addColumn("center", "NA"); + table.addColumn("date", "NA"); table.addColumn("has.any.reads", "false"); table.addColumn("is.paired.end", "false"); table.addColumn("n.reads.analyzed", "NA"); @@ -200,6 +203,7 @@ public class ReadGroupProperties extends ReadWalker { table.set(rgID, "library", rg.getLibrary()); table.set(rgID, "platform", rg.getPlatform()); table.set(rgID, "center", rg.getSequencingCenter()); + table.set(rgID, "date", dateFormatter.format(rg.getRunDate())); table.set(rgID, "has.any.reads", hasAnyReads); table.set(rgID, "is.paired.end", isPaired); table.set(rgID, "n.reads.analyzed", info.nReadsSeen); From 69611af7d303246b1087c5081aaeada66fbaded5 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 2 Mar 2012 18:53:45 -0500 Subject: [PATCH 21/32] Workaround for bug in Picard in ReadGroupProperties -- NPE caused when you call getRunDate on a read group without a date. --- .../diagnostics/ReadGroupProperties.java | 31 ++++++++++++------- .../ReadGroupPropertiesIntegrationTest.java | 2 +- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java index 2eb3b5e85..d7a48d321 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupProperties.java @@ -199,19 +199,28 @@ public class ReadGroupProperties extends ReadWalker { final boolean hasAnyReads = info.nReadsSeen > 0; final int readLength = info.readLength.getMedian(0); - table.set(rgID, "sample", rg.getSample()); - table.set(rgID, "library", rg.getLibrary()); - table.set(rgID, "platform", rg.getPlatform()); - table.set(rgID, "center", rg.getSequencingCenter()); - table.set(rgID, "date", dateFormatter.format(rg.getRunDate())); - table.set(rgID, "has.any.reads", hasAnyReads); - table.set(rgID, "is.paired.end", isPaired); - table.set(rgID, "n.reads.analyzed", info.nReadsSeen); - table.set(rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA"); - table.set(rgID, "median.read.length", hasAnyReads ? readLength : "NA" ); - table.set(rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" ); + setTableValue(table, rgID, "sample", rg.getSample()); + setTableValue(table, rgID, "library", rg.getLibrary()); + setTableValue(table, rgID, "platform", rg.getPlatform()); + setTableValue(table, rgID, "center", rg.getSequencingCenter()); + try { + setTableValue(table, rgID, "date", rg.getRunDate() != null ? dateFormatter.format(rg.getRunDate()) : "NA"); + } catch ( NullPointerException e ) { + // TODO: remove me when bug in Picard is fixed that causes NPE when date isn't present + setTableValue(table, rgID, "date", "NA"); + } + setTableValue(table, rgID, "has.any.reads", hasAnyReads); + setTableValue(table, rgID, "is.paired.end", isPaired); + setTableValue(table, rgID, "n.reads.analyzed", info.nReadsSeen); + setTableValue(table, rgID, "simple.read.type", hasAnyReads ? String.format("%dx%d", isPaired ? 2 : 1, readLength) : "NA"); + setTableValue(table, rgID, "median.read.length", hasAnyReads ? readLength : "NA" ); + setTableValue(table, rgID, "median.insert.size", hasAnyReads && isPaired ? info.insertSize.getMedian(0) : "NA" ); } report.print(out); } + + private final void setTableValue(GATKReportTable table, final String rgID, final String key, final Object value) { + table.set(rgID, key, value == null ? "NA" : value); + } } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java index 04c30a8fe..1a4c8db30 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/diagnostics/ReadGroupPropertiesIntegrationTest.java @@ -38,7 +38,7 @@ public class ReadGroupPropertiesIntegrationTest extends WalkerTest { WalkerTestSpec spec = new WalkerTestSpec( "-T ReadGroupProperties -R " + b37KGReference + " -I " + b37GoodBAM + " -L 20:10,000,000-11,000,000 -o %s", 1, - Arrays.asList("8e4d09665c0b65c971bd5dead24f95fe")); + Arrays.asList("6b8cce223af28cbadcfe87a3b841fc56")); executeTest("ReadGroupProperties:", spec); } } \ No newline at end of file From 3b5a7c34d74518739ca588be62e21f591d053aa1 Mon Sep 17 00:00:00 2001 From: Guillermo del Angel Date: Sun, 4 Mar 2012 10:24:29 -0500 Subject: [PATCH 22/32] Added argument to ValidationAmplicons to only output valid sequences - useful for not having to post-filter or grep resulting files before delivering downstream --- .../validation/ValidationAmplicons.java | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java index b27bef265..e812fb53a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/validation/ValidationAmplicons.java @@ -110,6 +110,13 @@ public class ValidationAmplicons extends RodWalker { @Argument(doc="Lower case SNPs rather than replacing with 'N'",fullName="lowerCaseSNPs",required=false) boolean lowerCaseSNPs = false; + /** + * If onlyOutputValidAmplicons is true, the output fasta file will contain only valid sequences. + * Useful for producing delivery-ready files. + */ + @Argument(doc="Only output valid sequences.",fullName="onlyOutputValidAmplicons",required=false) + boolean onlyOutputValidAmplicons = false; + /** * BWA single-end alignment is used as a primer specificity proxy. Low-complexity regions (that don't align back to themselves as a best hit) are lowercased. * This changes the size of the k-mer used for alignment. @@ -486,14 +493,16 @@ public class ValidationAmplicons extends RodWalker { valid = "Valid"; } - String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); - if (!sequenomOutput) - out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); - else { - seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record - probeName = probeName.replace("amplicon_","a"); - out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + if (!onlyOutputValidAmplicons || !sequenceInvalid) { + String seqIdentity = sequence.toString().replace('n', 'N').replace('i', 'I').replace('d', 'D'); + if (!sequenomOutput) + out.printf(">%s %s %s%n%s%n", allelePos != null ? allelePos.toString() : "multiple", valid, probeName, seqIdentity); + else { + seqIdentity = seqIdentity.replace("*",""); // identifier < 20 letters long, no * in ref allele, one line per record + probeName = probeName.replace("amplicon_","a"); + out.printf("%s_%s %s%n", allelePos != null ? allelePos.toString() : "multiple", probeName, seqIdentity); + } } } } From d6871967aea416f87b26dee1ce173155214e029d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 5 Mar 2012 08:28:42 -0500 Subject: [PATCH 23/32] Adding more unit tests and contracts to PairHMM util class. Updating HaplotypeCaller to use the new PairHMM util class. Now that the HMM result isn't dependent on the length of the haplotype there is no reason to ensure all haplotypes have the save length which simplifies the code considerably. --- .../broadinstitute/sting/utils/Haplotype.java | 36 +++++++++---------- .../broadinstitute/sting/MedianUnitTest.java | 7 ---- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index def2fc349..41b73d157 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -113,24 +113,27 @@ public class Haplotype { return isReference; } - public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final byte[] paddedRef, final int refStart, - final Cigar haplotypeCigar, final int numBasesAddedToStartOfHaplotype, final int refHaplotypeLength ) { + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStart, final Cigar haplotypeCigar ) { if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } - int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(refStart + numBasesAddedToStartOfHaplotype, haplotypeCigar, refInsertLocation); + int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStart, haplotypeCigar, refInsertLocation); if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype - return getBases().clone(); + return bases.clone(); } - haplotypeInsertLocation += numBasesAddedToStartOfHaplotype; - final byte[] newHaplotype = getBases().clone(); + byte[] newHaplotype; try { if( refAllele.length() == altAllele.length() ) { // SNP or MNP + newHaplotype = bases.clone(); for( int iii = 0; iii < altAllele.length(); iii++ ) { newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; } - } else if( refAllele.length() < altAllele.length() ) { // insertion + } else if( refAllele.length() < altAllele.length() ) { // insertion final int altAlleleLength = altAllele.length(); + newHaplotype = new byte[bases.length + altAlleleLength]; + for( int iii = 0; iii < bases.length; iii++ ) { + newHaplotype[iii] = bases[iii]; + } for( int iii = newHaplotype.length - 1; iii > haplotypeInsertLocation + altAlleleLength - 1; iii-- ) { newHaplotype[iii] = newHaplotype[iii-altAlleleLength]; } @@ -138,24 +141,17 @@ public class Haplotype { newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; } } else { // deletion - int refHaplotypeOffset = 0; - for( final CigarElement ce : haplotypeCigar.getCigarElements()) { - if(ce.getOperator() == CigarOperator.D) { refHaplotypeOffset += ce.getLength(); } - else if(ce.getOperator() == CigarOperator.I) { refHaplotypeOffset -= ce.getLength(); } - } - for( int iii = 0; iii < altAllele.length(); iii++ ) { - newHaplotype[haplotypeInsertLocation+iii] = altAllele.getBases()[iii]; - } final int shift = refAllele.length() - altAllele.length(); - for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length - shift; iii++ ) { - newHaplotype[iii] = newHaplotype[iii+shift]; + newHaplotype = new byte[bases.length - shift]; + for( int iii = 0; iii < haplotypeInsertLocation + altAllele.length(); iii++ ) { + newHaplotype[iii] = bases[iii]; } - for( int iii = 0; iii < shift; iii++ ) { - newHaplotype[iii+newHaplotype.length-shift] = paddedRef[refStart+refHaplotypeLength+refHaplotypeOffset+iii]; + for( int iii = haplotypeInsertLocation + altAllele.length(); iii < newHaplotype.length; iii++ ) { + newHaplotype[iii] = bases[iii+shift]; } } } catch (Exception e) { // event already on haplotype is too large/complex to insert another allele, most likely because of not enough reference padding - return getBases().clone(); + return bases.clone(); } return newHaplotype; diff --git a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java index c12db9b77..db89aee78 100644 --- a/public/java/test/org/broadinstitute/sting/MedianUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/MedianUnitTest.java @@ -29,7 +29,6 @@ package org.broadinstitute.sting; // the imports for unit testing. -import org.broadinstitute.sting.gatk.walkers.haplotypecaller.LikelihoodCalculationEngine; import org.broadinstitute.sting.utils.Median; import org.testng.Assert; import org.testng.annotations.BeforeSuite; @@ -42,12 +41,6 @@ import java.util.List; public class MedianUnitTest extends BaseTest { - LikelihoodCalculationEngine engine; - - @BeforeSuite - public void before() { - engine = new LikelihoodCalculationEngine(0, 0, false); - } // -------------------------------------------------------------------------------- // From e9ad382e749773827d78f90a9c5d6595c77aca72 Mon Sep 17 00:00:00 2001 From: Mauricio Carneiro Date: Sat, 3 Mar 2012 18:01:55 -0500 Subject: [PATCH 26/32] unifying the BQSR argument collection --- .../bqsr/RecalibrationArgumentCollection.java | 67 ++++++++++++++++++- 1 file changed, 65 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 38e7051e4..cc6f67cc9 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -25,8 +25,14 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; -import org.broadinstitute.sting.commandline.Argument; -import org.broadinstitute.sting.commandline.Hidden; +import org.broad.tribble.Feature; +import org.broadinstitute.sting.commandline.*; +import org.broadinstitute.sting.gatk.walkers.recalibration.CountCovariatesGatherer; + +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -39,6 +45,63 @@ import org.broadinstitute.sting.commandline.Hidden; public class RecalibrationArgumentCollection { + /** + * This algorithm treats every reference mismatch as an indication of error. However, real genetic variation is expected to mismatch the reference, + * so it is critical that a database of known polymorphic sites is given to the tool in order to skip over those sites. This tool accepts any number of RodBindings (VCF, Bed, etc.) + * for use as this database. For users wishing to exclude an interval list of known variation simply use -XL my.interval.list to skip over processing those sites. + * Please note however that the statistics reported by the tool will not accurately reflected those sites skipped by the -XL argument. + */ + @Input(fullName = "knownSites", shortName = "knownSites", doc = "A database of known polymorphic sites to skip over in the recalibration algorithm", required = false) + protected List> knownSites = Collections.emptyList(); + + /** + * After the header, data records occur one per line until the end of the file. The first several items on a line are the + * values of the individual covariates and will change depending on which covariates were specified at runtime. The last + * three items are the data- that is, number of observations for this combination of covariates, number of reference mismatches, + * and the raw empirical quality score calculated by phred-scaling the mismatch rate. + */ + @Gather(CountCovariatesGatherer.class) + @Output + protected PrintStream RECAL_FILE; + + /** + * List all implemented covariates. + */ + @Argument(fullName = "list", shortName = "ls", doc = "List the available covariates and exit", required = false) + protected boolean LIST_ONLY = false; + + /** + * Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you. See the list of covariates with -list. + */ + @Argument(fullName = "covariate", shortName = "cov", doc = "Covariates to be used in the recalibration. Each covariate is given as a separate cov parameter. ReadGroup and ReportedQuality are required covariates and are already added for you.", required = false) + protected String[] COVARIATES = null; + + /* + * Use the standard set of covariates in addition to the ones listed using the -cov argument + */ + @Argument(fullName = "standard_covs", shortName = "standard", doc = "Use the standard set of covariates in addition to the ones listed using the -cov argument", required = false) + protected boolean USE_STANDARD_COVARIATES = true; + + ///////////////////////////// + // Debugging-only Arguments + ///////////////////////////// + /** + * This calculation is critically dependent on being able to skip over known polymorphic sites. Please be sure that you know what you are doing if you use this option. + */ + @Hidden + @Argument(fullName = "run_without_dbsnp_potentially_ruining_quality", shortName = "run_without_dbsnp_potentially_ruining_quality", required = false, doc = "If specified, allows the recalibrator to be used without a dbsnp rod. Very unsafe and for expert users only.") + protected boolean RUN_WITHOUT_DBSNP = false; + + ///////////////////////////// + // protected Member Variables + ///////////////////////////// + protected final RecalDataManager dataManager = new RecalDataManager(); // Holds the data HashMap used to create collapsed data hashmaps (delta delta tables) + protected final ArrayList requestedCovariates = new ArrayList();// A list to hold the covariate objects that were requested + + protected final String SKIP_RECORD_ATTRIBUTE = "SKIP"; // used to label reads that should be skipped. + protected final String SEEN_ATTRIBUTE = "SEEN"; // used to label reads as processed. + + /** * CountCovariates and TableRecalibration accept a --solid_recal_mode flag which governs how the recalibrator handles the * reads which have had the reference inserted because of color space inconsistencies. From 14a77b1e717d699efbc63796d699b5317fd6a2ae Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 5 Mar 2012 12:28:32 -0500 Subject: [PATCH 27/32] Getting rid of redundant methods in MathUtils. Adding unit tests for approximateLog10SumLog10 and normalizeFromLog10. Increasing the precision of the Jacobian approximation used by approximateLog10SumLog which changes the UG+HC integration tests ever so slightly. --- .../indels/HaplotypeIndelErrorModel.java | 3 +- .../indels/PairHMMIndelErrorModel.java | 17 +-- .../GaussianMixtureModel.java | 11 +- .../broadinstitute/sting/utils/MathUtils.java | 128 ++++-------------- .../UnifiedGenotyperIntegrationTest.java | 32 ++--- .../sting/utils/MathUtilsUnitTest.java | 101 +++++++++++--- 6 files changed, 135 insertions(+), 157 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java index 200a250f2..26023bd2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/HaplotypeIndelErrorModel.java @@ -454,8 +454,7 @@ public class HaplotypeIndelErrorModel { // Compute log10(10^x1/2 + 10^x2/2) = log10(10^x1+x0^x2)-log10(2) // First term is approximated by Jacobian log with table lookup. // Second term is a constant added to both likelihoods so will be ignored - haplotypeLikehoodMatrix[i][j] += MathUtils.softMax(readLikelihood[0], - readLikelihood[1]); + haplotypeLikehoodMatrix[i][j] += MathUtils.approximateLog10SumLog10(readLikelihood[0], readLikelihood[1]); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java index 6410d619d..64993b43a 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/indels/PairHMMIndelErrorModel.java @@ -166,18 +166,17 @@ public class PairHMMIndelErrorModel { final double pBaseRead = (x == y)? baseMatchArray[(int)qual]:baseMismatchArray[(int)qual]; - matchMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][jm1] + pBaseRead, XMetricArray[im1][jm1] + pBaseRead, - YMetricArray[im1][jm1] + pBaseRead); + matchMetricArray[indI][indJ] = pBaseRead + MathUtils.approximateLog10SumLog10(new double[]{matchMetricArray[im1][jm1], XMetricArray[im1][jm1], YMetricArray[im1][jm1]}); final double c1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; final double d1 = indJ == Y_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - XMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); + XMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[im1][indJ] + c1, XMetricArray[im1][indJ] + d1); // update Y array final double c2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGOP[jm1]; final double d2 = indI == X_METRIC_LENGTH-1 ? END_GAP_COST : currentGCP[jm1]; - YMetricArray[indI][indJ] = MathUtils.softMax(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); + YMetricArray[indI][indJ] = MathUtils.approximateLog10SumLog10(matchMetricArray[indI][jm1] + c2, YMetricArray[indI][jm1] + d2); } } @@ -316,9 +315,7 @@ public class PairHMMIndelErrorModel { final int bestI = X_METRIC_LENGTH - 1, bestJ = Y_METRIC_LENGTH - 1; - final double bestMetric = MathUtils.softMax(matchMetricArray[bestI][bestJ], - XMetricArray[bestI][bestJ], - YMetricArray[bestI][bestJ]); + final double bestMetric = MathUtils.approximateLog10SumLog10(new double[]{ matchMetricArray[bestI][bestJ], XMetricArray[bestI][bestJ], YMetricArray[bestI][bestJ] }); /* if (DEBUG) { @@ -651,7 +648,7 @@ public class PairHMMIndelErrorModel { private final static double[] getHaplotypeLikelihoods(final int numHaplotypes, final int readCounts[], final double readLikelihoods[][]) { final double[][] haplotypeLikehoodMatrix = new double[numHaplotypes][numHaplotypes]; - // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplied to just a single loop without the intermediate NxN matrix + // todo: MAD 09/26/11 -- I'm almost certain this calculation can be simplified to just a single loop without the intermediate NxN matrix for (int i=0; i < numHaplotypes; i++) { for (int j=i; j < numHaplotypes; j++){ // combine likelihoods of haplotypeLikelihoods[i], haplotypeLikelihoods[j] @@ -665,7 +662,7 @@ public class PairHMMIndelErrorModel { final double li = readLikelihoods[readIdx][i]; final double lj = readLikelihoods[readIdx][j]; final int readCount = readCounts[readIdx]; - haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.softMax(li, lj) + LOG_ONE_HALF); + haplotypeLikehoodMatrix[i][j] += readCount * (MathUtils.approximateLog10SumLog10(li, lj) + LOG_ONE_HALF); } } } @@ -678,7 +675,7 @@ public class PairHMMIndelErrorModel { } } - // renormalize so that max element is zero. + // renormalize so that max element is zero. return MathUtils.normalizeFromLog10(genotypeLikelihoods, false, true); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java index 82776ca2e..6f0ae7c25 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/GaussianMixtureModel.java @@ -140,15 +140,16 @@ public class GaussianMixtureModel { } for( final VariantDatum datum : data ) { - final ArrayList pVarInGaussianLog10 = new ArrayList( gaussians.size() ); + final double[] pVarInGaussianLog10 = new double[gaussians.size()]; + int gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { final double pVarLog10 = gaussian.evaluateDatumLog10( datum ); - pVarInGaussianLog10.add( pVarLog10 ); + pVarInGaussianLog10[gaussianIndex++] = pVarLog10; } - final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10 ); - int iii = 0; + final double[] pVarInGaussianNormalized = MathUtils.normalizeFromLog10( pVarInGaussianLog10, false ); + gaussianIndex = 0; for( final MultivariateGaussian gaussian : gaussians ) { - gaussian.assignPVarInGaussian( pVarInGaussianNormalized[iii++] ); //BUGBUG: to clean up + gaussian.assignPVarInGaussian( pVarInGaussianNormalized[gaussianIndex++] ); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java index 6f2db67ee..a96cbffc5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/MathUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/MathUtils.java @@ -41,14 +41,6 @@ import java.util.*; * @author Kiran Garimella */ public class MathUtils { - /** Public constants - used for the Lanczos approximation to the factorial function - * (for the calculation of the binomial/multinomial probability in logspace) - * @param LANC_SEQ[] - an array holding the constants which correspond to the product - * of Chebyshev Polynomial coefficients, and points on the Gamma function (for interpolation) - * [see A Precision Approximation of the Gamma Function J. SIAM Numer. Anal. Ser. B, Vol. 1 1964. pp. 86-96] - * @param LANC_G - a value for the Lanczos approximation to the gamma function that works to - * high precision - */ /** * Private constructor. No instantiating this class! @@ -56,6 +48,28 @@ public class MathUtils { private MathUtils() { } + public static final double[] log10Cache; + private static final double[] jacobianLogTable; + private static final double JACOBIAN_LOG_TABLE_STEP = 0.001; + private static final double MAX_JACOBIAN_TOLERANCE = 10.0; + private static final int JACOBIAN_LOG_TABLE_SIZE = (int) (MAX_JACOBIAN_TOLERANCE / JACOBIAN_LOG_TABLE_STEP) + 1; + private static final int MAXN = 11000; + private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients + + static { + log10Cache = new double[LOG10_CACHE_SIZE]; + jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; + + log10Cache[0] = Double.NEGATIVE_INFINITY; + for (int k = 1; k < LOG10_CACHE_SIZE; k++) + log10Cache[k] = Math.log10(k); + + for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { + jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); + + } + } + // A fast implementation of the Math.round() method. This method does not perform // under/overflow checking, so this shouldn't be used in the general case (but is fine // if one is already make those checks before calling in to the rounding). @@ -536,7 +550,7 @@ public class MathUtils { // all negative) the largest value; also, we need to convert to normal-space. double maxValue = Utils.findMaxEntry(array); - // we may decide to just normalize in log space with converting to linear space + // we may decide to just normalize in log space without converting to linear space if (keepInLogSpace) { for (int i = 0; i < array.length; i++) array[i] -= maxValue; @@ -563,29 +577,6 @@ public class MathUtils { return normalized; } - public static double[] normalizeFromLog10(List array, boolean takeLog10OfOutput) { - double[] normalized = new double[array.size()]; - - // for precision purposes, we need to add (or really subtract, since they're - // all negative) the largest value; also, we need to convert to normal-space. - double maxValue = MathUtils.arrayMaxDouble(array); - for (int i = 0; i < array.size(); i++) - normalized[i] = Math.pow(10, array.get(i) - maxValue); - - // normalize - double sum = 0.0; - for (int i = 0; i < array.size(); i++) - sum += normalized[i]; - for (int i = 0; i < array.size(); i++) { - double x = normalized[i] / sum; - if (takeLog10OfOutput) - x = Math.log10(x); - normalized[i] = x; - } - - return normalized; - } - /** * normalizes the log10-based array. ASSUMES THAT ALL ARRAY ENTRIES ARE <= 0 (<= 1 IN REAL-SPACE). * @@ -596,10 +587,6 @@ public class MathUtils { return normalizeFromLog10(array, false); } - public static double[] normalizeFromLog10(List array) { - return normalizeFromLog10(array, false); - } - public static int maxElementIndex(final double[] array) { return maxElementIndex(array, array.length); } @@ -1207,78 +1194,11 @@ public class MathUtils { return ((double) num) / (Math.max(denom, 1)); } - public static final double[] log10Cache; - public static final double[] jacobianLogTable; - public static final int JACOBIAN_LOG_TABLE_SIZE = 101; - public static final double JACOBIAN_LOG_TABLE_STEP = 0.1; - public static final double INV_JACOBIAN_LOG_TABLE_STEP = 1.0 / JACOBIAN_LOG_TABLE_STEP; - public static final double MAX_JACOBIAN_TOLERANCE = 10.0; - private static final int MAXN = 11000; - private static final int LOG10_CACHE_SIZE = 4 * MAXN; // we need to be able to go up to 2*(2N) when calculating some of the coefficients - - static { - log10Cache = new double[LOG10_CACHE_SIZE]; - jacobianLogTable = new double[JACOBIAN_LOG_TABLE_SIZE]; - - log10Cache[0] = Double.NEGATIVE_INFINITY; - for (int k = 1; k < LOG10_CACHE_SIZE; k++) - log10Cache[k] = Math.log10(k); - - for (int k = 0; k < JACOBIAN_LOG_TABLE_SIZE; k++) { - jacobianLogTable[k] = Math.log10(1.0 + Math.pow(10.0, -((double) k) * JACOBIAN_LOG_TABLE_STEP)); - - } - } - - static public double softMax(final double[] vec) { - double acc = vec[0]; - for (int k = 1; k < vec.length; k++) - acc = softMax(acc, vec[k]); - - return acc; - - } - static public double max(double x0, double x1, double x2) { double a = Math.max(x0, x1); return Math.max(a, x2); } - static public double softMax(final double x0, final double x1, final double x2) { - // compute naively log10(10^x[0] + 10^x[1]+...) - // return Math.log10(MathUtils.sumLog10(vec)); - - // better approximation: do Jacobian logarithm function on data pairs - double a = softMax(x0, x1); - return softMax(a, x2); - } - - static public double softMax(final double x, final double y) { - // we need to compute log10(10^x + 10^y) - // By Jacobian logarithm identity, this is equal to - // max(x,y) + log10(1+10^-abs(x-y)) - // we compute the second term as a table lookup - // with integer quantization - - // slow exact version: - // return Math.log10(Math.pow(10.0,x) + Math.pow(10.0,y)); - - double diff = x - y; - - if (diff > MAX_JACOBIAN_TOLERANCE) - return x; - else if (diff < -MAX_JACOBIAN_TOLERANCE) - return y; - else if (diff >= 0) { - int ind = (int) (diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); - return x + jacobianLogTable[ind]; - } - else { - int ind = (int) (-diff * INV_JACOBIAN_LOG_TABLE_STEP + 0.5); - return y + jacobianLogTable[ind]; - } - } - public static double phredScaleToProbability(byte q) { return Math.pow(10, (-q) / 10.0); } @@ -1734,6 +1654,4 @@ public class MathUtils { return bitSetFrom(baseTen+preContext); // the number representing this DNA string is the base_10 representation plus all combinations that preceded this string length. } - - } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index 823eeeeb9..cfb0d11a1 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -28,7 +28,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("202b337ebbea3def1be8495eb363dfa8")); + Arrays.asList("8f81a14fffc1a59b4b066f8595dc1232")); executeTest("test MultiSample Pilot1", spec); } @@ -52,7 +52,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("ae29b9c9aacce8046dc780430540cd62")); + Arrays.asList("c5b53231f4f6d9524bc4ec8115f44f5c")); executeTest("test SingleSample Pilot2", spec); } @@ -60,7 +60,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " -nosl -NO_HEADER -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + validationDataLocation + "multiallelic.snps.bam -o %s -L " + validationDataLocation + "multiallelic.snps.intervals", 1, - Arrays.asList("10027d13befaa07b7900a7af0ae0791c")); + Arrays.asList("5af005255240a2186f04cb50851b8b6f")); executeTest("test Multiple SNP alleles", spec); } @@ -70,7 +70,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "fda341de80b3f6fd42a83352b18b1d65"; + private final static String COMPRESSED_OUTPUT_MD5 = "a08df9aea2b3df09cf90ff8e6e3be3ea"; @Test public void testCompressedOutput() { @@ -91,7 +91,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // Note that we need to turn off any randomization for this to work, so no downsampling and no annotations - String md5 = "32a34362dff51d8b73a3335048516d82"; + String md5 = "6358934c1c26345013a38261b8c45aa4"; WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -dt NONE -G none -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,075,000", 1, @@ -179,8 +179,8 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testHeterozyosity() { HashMap e = new HashMap(); - e.put( 0.01, "2cb2544739e01f6c08fd820112914317" ); - e.put( 1.0 / 1850, "730b2b83a4b1f6d46fc3b5cd7d90756c" ); + e.put( 0.01, "926b58038dd4989bf7eda697a847eea9" ); + e.put( 1.0 / 1850, "93f44105b43b65730a3b821e27b0fa16" ); for ( Map.Entry entry : e.entrySet() ) { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( @@ -204,7 +204,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("f0fbe472f155baf594b1eeb58166edef")); + Arrays.asList("a1b75a7e12b160b0be823228c958573f")); executeTest(String.format("test multiple technologies"), spec); } @@ -223,7 +223,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("8c87c749a7bb5a76ed8504d4ec254272")); + Arrays.asList("3bda1279cd6dcb47885f3e19466f11b9")); executeTest(String.format("test calling with BAQ"), spec); } @@ -242,7 +242,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("bd9d3d50a1f49605d7cd592a0f446899")); + Arrays.asList("d9fc3ba94a0d46029778c7b457e7292a")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -257,7 +257,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("2ad52c2e75b3ffbfd8f03237c444e8e6")); + Arrays.asList("b2e30ae3e5ffa6108f9f6178b1d2e679")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("91cd6d2e3972b0b8e4064bb35a33241f")); + Arrays.asList("2cd182a84613fa91a6020466d2d327e2")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -280,7 +280,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("c60a44ba94a80a0cb1fba8b6f90a13cd")); + Arrays.asList("9cd08dc412a007933381e9c76c073899")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec1); } @@ -290,7 +290,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("320f61c87253aba77d6dc782242cba8b")); + Arrays.asList("5ef1f007d3ef77c1b8f31e5e036eff53")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec2); } @@ -300,7 +300,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec3 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2.20101123.indels.sites.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,080,000", 1, - Arrays.asList("c9897b80615c53a4ea10a4b193d56d9c")); + Arrays.asList("2609675a356f2dfc86f8a1d911210978")); executeTest("test MultiSample Pilot2 indels with complicated records", spec3); } @@ -309,7 +309,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec4 = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + validationDataLocation + "ALL.wgs.union_v2_chr20_100_110K.20101123.indels.sites.vcf -I " + validationDataLocation + "phase1_GBR_realigned.chr20.100K-110K.bam -o %s -L 20:100,000-110,000", 1, - Arrays.asList("5282fdb1711a532d726c13507bf80a21")); + Arrays.asList("4fdd8da77167881b71b3547da5c13f94")); executeTest("test MultiSample Phase1 indels with complicated records", spec4); } diff --git a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java index 75fc44873..1ba6c74d4 100755 --- a/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/MathUtilsUnitTest.java @@ -205,6 +205,24 @@ public class MathUtilsUnitTest extends BaseTest { } } + /** + * Private functions used by testArrayShuffle() + */ + private boolean hasUniqueElements(Object[] x) { + for (int i = 0; i < x.length; i++) + for (int j = i + 1; j < x.length; j++) + if (x[i].equals(x[j]) || x[i] == x[j]) + return false; + return true; + } + + private boolean hasAllElements(final Object[] expected, final Object[] actual) { + HashSet set = new HashSet(); + set.addAll(Arrays.asList(expected)); + set.removeAll(Arrays.asList(actual)); + return set.isEmpty(); + } + @Test(enabled = true) public void testIntAndBitSetConversion() { Assert.assertEquals(MathUtils.intFrom(MathUtils.bitSetFrom(428)), 428); @@ -234,26 +252,71 @@ public class MathUtilsUnitTest extends BaseTest { Assert.assertEquals(MathUtils.dnaFrom(MathUtils.bitSetFrom("AACGTCAATGCAGTCAAGTCAGACGTGGGTT")), "AACGTCAATGCAGTCAAGTCAGACGTGGGTT"); // testing max precision (length == 31) } + @Test + public void testApproximateLog10SumLog10() { + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, 0.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, 0.0), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(0.0, -1.0), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-2.2, -3.5), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-1.0, -7.1), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(5.0, 6.2), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(38.1, 16.2), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-38.1, 6.2), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-19.1, -37.1), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-29.1, -27.6), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-0.12345, -0.23456), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(-15.7654, -17.0101), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); - private boolean hasUniqueElements(Object[] x) { - for (int i = 0; i < x.length; i++) - for (int j = i + 1; j < x.length; j++) - if (x[i].equals(x[j]) || x[i] == x[j]) - return false; + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101)), 1e-3); + + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, 0.0, 0.0}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, 0.0) + Math.pow(10.0, 0.0)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{0.0, -1.0, -2.5}), Math.log10(Math.pow(10.0, 0.0) + Math.pow(10.0, -1.0) + Math.pow(10.0, -2.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-2.2, -3.5, -1.1}), Math.log10(Math.pow(10.0, -2.2) + Math.pow(10.0, -3.5) + Math.pow(10.0, -1.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-1.0, -7.1, 0.5}), Math.log10(Math.pow(10.0, -1.0) + Math.pow(10.0, -7.1) + Math.pow(10.0, 0.5)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{5.0, 6.2, 1.3}), Math.log10(Math.pow(10.0, 5.0) + Math.pow(10.0, 6.2) + Math.pow(10.0, 1.3)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{38.1, 16.2, 18.1}), Math.log10(Math.pow(10.0, 38.1) + Math.pow(10.0, 16.2) + Math.pow(10.0, 18.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-38.1, 6.2, 26.6}), Math.log10(Math.pow(10.0, -38.1) + Math.pow(10.0, 6.2) + Math.pow(10.0, 26.6)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-19.1, -37.1, -45.1}), Math.log10(Math.pow(10.0, -19.1) + Math.pow(10.0, -37.1) + Math.pow(10.0, -45.1)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-29.1, -27.6, -26.2}), Math.log10(Math.pow(10.0, -29.1) + Math.pow(10.0, -27.6) + Math.pow(10.0, -26.2)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-0.12345, -0.23456, -0.34567}), Math.log10(Math.pow(10.0, -0.12345) + Math.pow(10.0, -0.23456) + Math.pow(10.0, -0.34567)), 1e-3); + Assert.assertEquals(MathUtils.approximateLog10SumLog10(new double[]{-15.7654, -17.0101, -17.9341}), Math.log10(Math.pow(10.0, -15.7654) + Math.pow(10.0, -17.0101) + Math.pow(10.0, -17.9341)), 1e-3); + } + + @Test + public void testNormalizeFromLog10() { + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{0.0, 0.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, -1.0, -1.1, -7.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.1, -7.8}, false, true), new double[]{0.0, 0.0, 0.0, -0.1, -6.8})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-10.0, -7.8, -10.5, -1.1, -10.0}, false, true), new double[]{-8.9, -6.7, -9.4, 0.0, -8.9})); + + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -1.0, -1.0, -1.0}), new double[]{0.25, 0.25, 0.25, 0.25})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -1.0}), new double[]{0.1 * 1.0 / 0.301, 0.001 * 1.0 / 0.301, 0.1 * 1.0 / 0.301, 0.1 * 1.0 / 0.301})); + Assert.assertTrue(compareDoubleArrays(MathUtils.normalizeFromLog10(new double[]{-1.0, -3.0, -1.0, -2.0}), new double[]{0.1 * 1.0 / 0.211, 0.001 * 1.0 / 0.211, 0.1 * 1.0 / 0.211, 0.01 * 1.0 / 0.211})); + } + + /** + * Private function used by testNormalizeFromLog10() + */ + private boolean compareDoubleArrays(double[] b1, double[] b2) { + if( b1.length != b2.length ) { + return false; // sanity check + } + + for( int i=0; i < b1.length; i++ ){ + if ( MathUtils.compareDoubles(b1[i], b2[i]) != 0 ) + return false; + } return true; } - - private boolean hasAllElements(final Object[] expected, final Object[] actual) { - HashSet set = new HashSet(); - set.addAll(Arrays.asList(expected)); - set.removeAll(Arrays.asList(actual)); - return set.isEmpty(); - } - - private void p (Object []x) { - for (Object v: x) - System.out.print((Integer) v + " "); - System.out.println(); - } - } From c6ded4d23c69247700504ac5314fb56e27dfb2e1 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 5 Mar 2012 17:54:42 -0500 Subject: [PATCH 28/32] Bug fix for hard clipping reads when base insertion and base deletion qualities are present in the read. Updating HaplotypeCaller integration tests to reflect all the recent changes. --- .../sting/utils/clipping/ClippingOp.java | 10 ++++++ .../sting/utils/fragments/FragmentUtils.java | 36 +++++++++++-------- .../sting/utils/sam/GATKSAMRecord.java | 4 +++ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java index fb133d902..62a67a1f2 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ClippingOp.java @@ -4,6 +4,7 @@ import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.gatk.walkers.bqsr.RecalDataManager; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -314,6 +315,15 @@ public class ClippingOp { if (start == 0) hardClippedRead.setAlignmentStart(read.getAlignmentStart() + calculateAlignmentStartShift(read.getCigar(), cigarShift.cigar)); + if (read.hasBaseIndelQualities()) { + byte[] newBaseInsertionQuals = new byte[newLength]; + byte[] newBaseDeletionQuals = new byte[newLength]; + System.arraycopy(read.getBaseInsertionQualities(), copyStart, newBaseInsertionQuals, 0, newLength); + System.arraycopy(read.getBaseDeletionQualities(), copyStart, newBaseDeletionQuals, 0, newLength); + hardClippedRead.setBaseQualities(newBaseInsertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION); + hardClippedRead.setBaseQualities(newBaseDeletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION); + } + return hardClippedRead; } diff --git a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java index 7104b1edd..eea45567f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/fragments/FragmentUtils.java @@ -151,23 +151,16 @@ public class FragmentUtils { final int numBases = firstReadStop + secondRead.getReadLength(); final byte[] bases = new byte[numBases]; final byte[] quals = new byte[numBases]; - // BUGBUG: too verbose, clean this up. final byte[] insertionQuals = new byte[numBases]; final byte[] deletionQuals = new byte[numBases]; final byte[] firstReadBases = firstRead.getReadBases(); final byte[] firstReadQuals = firstRead.getBaseQualities(); - final byte[] firstReadInsertionQuals = firstRead.getBaseInsertionQualities(); - final byte[] firstReadDeletionQuals = firstRead.getBaseDeletionQualities(); final byte[] secondReadBases = secondRead.getReadBases(); final byte[] secondReadQuals = secondRead.getBaseQualities(); - final byte[] secondReadInsertionQuals = secondRead.getBaseInsertionQualities(); - final byte[] secondReadDeletionQuals = secondRead.getBaseDeletionQualities(); for(int iii = 0; iii < firstReadStop; iii++) { bases[iii] = firstReadBases[iii]; quals[iii] = firstReadQuals[iii]; - insertionQuals[iii] = firstReadInsertionQuals[iii]; - deletionQuals[iii] = firstReadDeletionQuals[iii]; } for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { if( firstReadQuals[iii] > MIN_QUAL_BAD_OVERLAP && secondReadQuals[iii-firstReadStop] > MIN_QUAL_BAD_OVERLAP && firstReadBases[iii] != secondReadBases[iii-firstReadStop] ) { @@ -175,22 +168,16 @@ public class FragmentUtils { } bases[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadBases[iii] : secondReadBases[iii-firstReadStop] ); quals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadQuals[iii] : secondReadQuals[iii-firstReadStop] ); - insertionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadInsertionQuals[iii] : secondReadInsertionQuals[iii-firstReadStop] ); // Purposefully checking the highest base quality score - deletionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadDeletionQuals[iii] : secondReadDeletionQuals[iii-firstReadStop] ); // Purposefully checking the highest base quality score } for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { bases[iii] = secondReadBases[iii-firstReadStop]; quals[iii] = secondReadQuals[iii-firstReadStop]; - insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop]; - deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop]; } final GATKSAMRecord returnRead = new GATKSAMRecord(firstRead.getHeader()); returnRead.setAlignmentStart(firstRead.getUnclippedStart()); returnRead.setReadBases( bases ); - returnRead.setBaseQualities( quals, RecalDataManager.BaseRecalibrationType.BASE_SUBSTITUTION ); - returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION ); - returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION ); + returnRead.setBaseQualities( quals ); returnRead.setReadGroup( firstRead.getReadGroup() ); returnRead.setReferenceName( firstRead.getReferenceName() ); final CigarElement c = new CigarElement(bases.length, CigarOperator.M); @@ -199,6 +186,27 @@ public class FragmentUtils { returnRead.setCigar( new Cigar( cList )); returnRead.setMappingQuality( firstRead.getMappingQuality() ); + if( firstRead.hasBaseIndelQualities() || secondRead.hasBaseIndelQualities() ) { + final byte[] firstReadInsertionQuals = firstRead.getBaseInsertionQualities(); + final byte[] firstReadDeletionQuals = firstRead.getBaseDeletionQualities(); + final byte[] secondReadInsertionQuals = secondRead.getBaseInsertionQualities(); + final byte[] secondReadDeletionQuals = secondRead.getBaseDeletionQualities(); + for(int iii = 0; iii < firstReadStop; iii++) { + insertionQuals[iii] = firstReadInsertionQuals[iii]; + deletionQuals[iii] = firstReadDeletionQuals[iii]; + } + for(int iii = firstReadStop; iii < firstRead.getReadLength(); iii++) { + insertionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadInsertionQuals[iii] : secondReadInsertionQuals[iii-firstReadStop] ); // Purposefully checking the highest *base* quality score + deletionQuals[iii] = ( firstReadQuals[iii] > secondReadQuals[iii-firstReadStop] ? firstReadDeletionQuals[iii] : secondReadDeletionQuals[iii-firstReadStop] ); // Purposefully checking the highest *base* quality score + } + for(int iii = firstRead.getReadLength(); iii < numBases; iii++) { + insertionQuals[iii] = secondReadInsertionQuals[iii-firstReadStop]; + deletionQuals[iii] = secondReadDeletionQuals[iii-firstReadStop]; + } + returnRead.setBaseQualities( insertionQuals, RecalDataManager.BaseRecalibrationType.BASE_INSERTION ); + returnRead.setBaseQualities( deletionQuals, RecalDataManager.BaseRecalibrationType.BASE_DELETION ); + } + final ArrayList returnList = new ArrayList(); returnList.add(returnRead); return returnList; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java index f5a9b2f45..648dafb81 100755 --- a/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/GATKSAMRecord.java @@ -194,6 +194,10 @@ public class GATKSAMRecord extends BAMRecord { } } + public boolean hasBaseIndelQualities() { + return getAttribute( BQSR_BASE_INSERTION_QUALITIES ) != null || getAttribute( BQSR_BASE_DELETION_QUALITIES ) != null; + } + public byte[] getBaseInsertionQualities() { byte[] quals = SAMUtils.fastqToPhred( getStringAttribute( BQSR_BASE_INSERTION_QUALITIES ) ); if( quals == null ) { From 9b53250bef91b2f9f3d18752d210e1d4f1bd85a2 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 5 Mar 2012 21:07:36 -0500 Subject: [PATCH 29/32] Adding Unit test for Haplotype class. Used in HC's genotype given alleles mode. --- .../java/src/org/broadinstitute/sting/utils/Haplotype.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index 41b73d157..085794bab 100755 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.utils; +import com.google.java.contract.Requires; import net.sf.samtools.Cigar; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; @@ -113,10 +114,11 @@ public class Haplotype { return isReference; } - public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStart, final Cigar haplotypeCigar ) { + @Requires({"refInsertLocation >= 0", "hapStartInRefCoords >= 0"}) + public byte[] insertAllele( final Allele refAllele, final Allele altAllele, int refInsertLocation, final int hapStartInRefCoords, final Cigar haplotypeCigar ) { if( refAllele.length() != altAllele.length() ) { refInsertLocation++; } - int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStart, haplotypeCigar, refInsertLocation); + int haplotypeInsertLocation = getHaplotypeCoordinateForReferenceCoordinate(hapStartInRefCoords, haplotypeCigar, refInsertLocation); if( haplotypeInsertLocation == -1 ) { // desired change falls inside deletion so don't bother creating a new haplotype return bases.clone(); } From f6905630bb2a860364ed8684513a70290272d179 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Mon, 5 Mar 2012 21:08:07 -0500 Subject: [PATCH 30/32] Adding Unit test for Haplotype class. Used in HC's genotype given alleles mode. --- .../sting/utils/HaplotypeUnitTest.java | 163 ++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java new file mode 100644 index 000000000..25bd7a2eb --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/HaplotypeUnitTest.java @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2010 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils; + + +import net.sf.samtools.Cigar; +import net.sf.samtools.CigarElement; +import net.sf.samtools.CigarOperator; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.variantcontext.Allele; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.Test; + +import java.util.*; + +/** + * Basic unit test for Haplotype Class + */ +public class HaplotypeUnitTest extends BaseTest { + @BeforeClass + public void init() { + } + + @Test + public void testSimpleInsertionAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AACTTCTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCACTTAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAAACTTCTGGTCAACTGGTCA"; + basicInsertTest("-", "ACTT", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testSimpleDeletionAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "ATCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("ACTT", "-", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("ACTT", "-", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAATCAACTGGTCA"; + basicInsertTest("ACTT", "-", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testSimpleSNPAllele() { + final String bases = "ACTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(bases.length(), CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AGTGGTCAACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("C", "G", 1, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCTACTGGTCAACTGGTCAACTGGTCA"; + basicInsertTest("A", "T", 7, h1Cigar, bases, h1bases); + h1bases = "ACTGGTCAACTGGTCAAATGGTCAACTGGTCA"; + basicInsertTest("C", "A", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexInsertionAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AACTTTCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCACTTGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGACTTGGGGA" + "AGGC"; + basicInsertTest("-", "ACTT", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexDeletionAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "A" + "CGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("ACTT", "-", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("ACTT", "-", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGA" + "AGGC"; + basicInsertTest("ACTT", "-", 17, h1Cigar, bases, h1bases); + } + + @Test + public void testComplexSNPAllele() { + final String bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + + final ArrayList h1CigarList = new ArrayList(); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + h1CigarList.add(new CigarElement(10, CigarOperator.I)); + h1CigarList.add(new CigarElement(8, CigarOperator.M)); + h1CigarList.add(new CigarElement(3, CigarOperator.D)); + h1CigarList.add(new CigarElement(7, CigarOperator.M)); + h1CigarList.add(new CigarElement(4, CigarOperator.M)); + final Cigar h1Cigar = new Cigar(h1CigarList); + String h1bases = "AGCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("T", "G", 1, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCTATCG" + "AGGGGGA" + "AGGC"; + basicInsertTest("G", "T", 7, h1Cigar, bases, h1bases); + h1bases = "ATCG" + "CCGGCCGGCC" + "ATCGATCG" + "AGCGGGA" + "AGGC"; + basicInsertTest("G", "C", 17, h1Cigar, bases, h1bases); + } + + private void basicInsertTest(String ref, String alt, int loc, Cigar cigar, String hap, String newHap) { + final int INDEL_PADDING_BASE = (ref.length() == alt.length() ? 0 : 1); + final Haplotype h = new Haplotype(hap.getBytes()); + final Allele h1refAllele = Allele.create(ref, true); + final Allele h1altAllele = Allele.create(alt, false); + final Haplotype h1 = new Haplotype( h.insertAllele(h1refAllele, h1altAllele, loc - INDEL_PADDING_BASE, 0, cigar) ); + final Haplotype h1expected = new Haplotype(newHap.getBytes()); + Assert.assertEquals(h1, h1expected); + + } +} From 027843d7913aa2fc58bfcd3f6604d6caf42bfe1c Mon Sep 17 00:00:00 2001 From: Lechu Date: Sun, 4 Mar 2012 03:32:30 +0100 Subject: [PATCH 31/32] I've simply added a "library(grid)" call at the beginning of the R script generation since R 2.14.2 doesn't seem to load the "grid" package as default. I haven't tested it on previous R versions (you may edit the R version comment to be more precise if desired), but I'm almost certain that this library call shouldn't do any harm on them. Signed-off-by: Ryan Poplin --- .../gatk/walkers/variantrecalibration/VariantRecalibrator.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 7cc5b1625..3cdcf4982 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -372,6 +372,8 @@ public class VariantRecalibrator extends RodWalker Date: Wed, 22 Feb 2012 16:45:20 -0500 Subject: [PATCH 32/32] Public-key authorization scheme to restrict use of NO_ET -Running the GATK with the -et NO_ET or -et STDOUT options now requires a key issued by us. Our reasons for doing this, and the procedure for our users to request keys, are documented here: http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home -A GATK user key is an email address plus a cryptographic signature signed using our private key, all wrapped in a GZIP container. User keys are validated using the public key we now distribute with the GATK. Our private key is kept in a secure location. -Keys are cryptographically secure in that valid keys definitely came from us and keys cannot be fabricated, however keys are not "copy-protected" in any way. -Includes private, standalone utilities to create a new GATK user key (GenerateGATKUserKey) and to create a new master public/private key pair (GenerateKeyPair). Usage of these tools will be documented on the internal wiki shortly. -Comprehensive unit/integration tests, including tests to ensure the continued integrity of the GATK master public/private key pair. -Generation of new user keys and the new unit/integration tests both require access to the GATK private key, which can only be read by members of the group "gsagit". --- build.xml | 6 + .../sting/gatk/CommandLineExecutable.java | 30 +- .../arguments/GATKArgumentCollection.java | 5 +- .../sting/gatk/phonehome/GATKRunReport.java | 9 +- .../sting/utils/crypt/CryptUtils.java | 390 ++++++++++++++++++ .../sting/utils/crypt/GATKKey.java | 349 ++++++++++++++++ .../sting/utils/exceptions/UserException.java | 36 ++ .../sting/utils/io/IOUtils.java | 172 ++++++++ .../org/broadinstitute/sting/BaseTest.java | 4 + .../org/broadinstitute/sting/WalkerTest.java | 25 +- .../sting/utils/crypt/CryptUtilsUnitTest.java | 177 ++++++++ .../utils/crypt/GATKKeyIntegrationTest.java | 156 +++++++ .../sting/utils/crypt/GATKKeyUnitTest.java | 113 +++++ .../sting/utils/io/IOUtilsUnitTest.java | 92 +++++ public/keys/GATK_public.key | Bin 0 -> 294 bytes public/packages/GATKEngine.xml | 2 + 16 files changed, 1551 insertions(+), 15 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java create mode 100644 public/keys/GATK_public.key diff --git a/build.xml b/build.xml index 1df75cd1d..d3e25d424 100644 --- a/build.xml +++ b/build.xml @@ -47,6 +47,7 @@ + @@ -567,6 +568,7 @@ + @@ -615,6 +617,9 @@ + + + @@ -879,6 +884,7 @@ + diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java index 32002e093..e5aaf2338 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineExecutable.java @@ -35,9 +35,12 @@ import org.broadinstitute.sting.gatk.io.stubs.VCFWriterArgumentTypeDescriptor; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.refdata.utils.RMDTriplet; import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.classloader.JVMUtils; +import org.broadinstitute.sting.utils.crypt.CryptUtils; +import org.broadinstitute.sting.utils.crypt.GATKKey; +import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.text.ListFileUtils; +import java.security.PublicKey; import java.util.*; /** @@ -78,6 +81,9 @@ public abstract class CommandLineExecutable extends CommandLineProgram { Walker walker = engine.getWalkerByName(getAnalysisName()); try { + // Make sure a valid GATK user key is present, if required. + authorizeGATKRun(); + engine.setArguments(getArgumentCollection()); // File lists can require a bit of additional expansion. Set these explicitly by the engine. @@ -130,6 +136,28 @@ public abstract class CommandLineExecutable extends CommandLineProgram { return 0; } + /** + * Authorizes this run of the GATK by checking for a valid GATK user key, if required. + * Currently, a key is required only if running with the -et NO_ET or -et STDOUT options. + */ + private void authorizeGATKRun() { + if ( getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.NO_ET || + getArgumentCollection().phoneHomeType == GATKRunReport.PhoneHomeOption.STDOUT ) { + if ( getArgumentCollection().gatkKeyFile == null ) { + throw new UserException("Running with the -et NO_ET or -et STDOUT option requires a GATK Key file. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " + + "for more information and instructions on how to obtain a key."); + } + else { + PublicKey gatkPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + GATKKey gatkUserKey = new GATKKey(gatkPublicKey, getArgumentCollection().gatkKeyFile); + + if ( ! gatkUserKey.isValid() ) { + throw new UserException.KeySignatureVerificationException(getArgumentCollection().gatkKeyFile); + } + } + } + } /** * Generate the GATK run report for this walker using the current GATKEngine, if -et is enabled. diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index 8ec707801..02d211a0c 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -65,9 +65,12 @@ public class GATKArgumentCollection { @Argument(fullName = "read_buffer_size", shortName = "rbs", doc="Number of reads per SAM file to buffer in memory", required = false) public Integer readBufferSize = null; - @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? Standard is the default, can be verbose or NO_ET so nothing is posted to the run repository", required = false) + @Argument(fullName = "phone_home", shortName = "et", doc="What kind of GATK run report should we generate? STANDARD is the default, can be NO_ET so nothing is posted to the run repository. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false) public GATKRunReport.PhoneHomeOption phoneHomeType = GATKRunReport.PhoneHomeOption.STANDARD; + @Argument(fullName = "gatk_key", shortName = "K", doc="GATK Key file. Required if running with -et NO_ET. Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for details.", required = false) + public File gatkKeyFile = null; + @Argument(fullName = "read_filter", shortName = "rf", doc = "Specify filtration criteria to apply to each read individually", required = false) public List readFilters = new ArrayList(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index e8627ef4c..f1f74069f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -154,9 +154,7 @@ public class GATKRunReport { /** Standard option. Writes to local repository if it can be found, or S3 otherwise */ STANDARD, /** Force output to STDOUT. For debugging only */ - STDOUT, - /** Force output to S3. For debugging only */ - AWS_S3 // todo -- remove me -- really just for testing purposes + STDOUT } private static final DateFormat dateFormat = new SimpleDateFormat("yyyy/MM/dd HH.mm.ss"); @@ -239,11 +237,8 @@ public class GATKRunReport { case STDOUT: postReportToStream(System.out); break; - case AWS_S3: - postReportToAWSS3(); - break; default: - exceptDuringRunReport("BUG: unexcepted PhoneHomeOption "); + exceptDuringRunReport("BUG: unexpected PhoneHomeOption "); break; } } diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java b/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java new file mode 100644 index 000000000..e84b1432e --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/crypt/CryptUtils.java @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.io.IOUtils; + +import javax.crypto.Cipher; +import java.io.File; +import java.io.InputStream; +import java.security.*; +import java.security.spec.InvalidKeySpecException; +import java.security.spec.KeySpec; +import java.security.spec.PKCS8EncodedKeySpec; +import java.security.spec.X509EncodedKeySpec; +import java.util.Arrays; + +/** + * A set of cryptographic utility methods and constants. + * + * Contains methods to: + * + * -Create a public/private key pair + * -Read and write public/private keys to/from files/streams + * -Load the GATK master private/public keys + * -Encrypt/decrypt data + * + * Also contains constants that control the cryptographic defaults + * throughout the GATK. + * + * @author David Roazen + */ +public class CryptUtils { + + // --------------------------------------------------------------------------------- + // Constants (these control the default cryptographic settings throughout the GATK): + // --------------------------------------------------------------------------------- + + /** + * Default key length in bits of newly-created keys. 2048 bits provides a good balance between + * security and speed. + */ + public static final int DEFAULT_KEY_LENGTH = 2048; + + /** + * Default encryption algorithm to use, when none is specified. + */ + public static final String DEFAULT_ENCRYPTION_ALGORITHM = "RSA"; + + /** + * Default random-number generation algorithm to use, when none is specified. + */ + public static final String DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM = "SHA1PRNG"; + + /** + * Name of the public key file distributed with the GATK. This file is packaged + * into the GATK jar, and we use the system ClassLoader to find it. + */ + public static final String GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME = "GATK_public.key"; + + /** + * Location of the master copy of the GATK private key. + */ + public static final String GATK_MASTER_PRIVATE_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_private.key"; + + /** + * Location of the master copy of the GATK public key. This file should always be the same as + * the public key file distributed with the GATK (and there are automated tests to ensure that it is). + */ + public static final String GATK_MASTER_PUBLIC_KEY_FILE = "/humgen/gsa-hpprojects/GATK/data/gatk_master_keys/GATK_public.key"; + + /** + * Directory where generated GATK user keys are stored. See the GATKKey class for more information. + */ + public static final String GATK_USER_KEY_DIRECTORY = "/humgen/gsa-hpprojects/GATK/data/gatk_user_keys/"; + + + // ----------------------- + // Utility Methods: + // ----------------------- + + /** + * Generate a new public/private key pair using the default encryption settings defined above. + * + * @return A new public/private key pair created using the default settings + */ + public static KeyPair generateKeyPair() { + return generateKeyPair(DEFAULT_KEY_LENGTH, DEFAULT_ENCRYPTION_ALGORITHM, DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Generate a new public/private key pair using custom encryption settings. + * + * @param keyLength Length of the key in bits + * @param encryptionAlgorithm Encryption algorithm to use + * @param randNumberAlgorithm Random-number generation algorithm to use + * @return A new public/private key pair, created according to the specified parameters + */ + public static KeyPair generateKeyPair( int keyLength, String encryptionAlgorithm, String randNumberAlgorithm ) { + try { + KeyPairGenerator keyGen = KeyPairGenerator.getInstance(encryptionAlgorithm); + SecureRandom randomnessSource = createRandomnessSource(randNumberAlgorithm); + + keyGen.initialize(keyLength, randomnessSource); + return keyGen.generateKeyPair(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( Exception e ) { + throw new ReviewedStingException("Error while generating key pair", e); + } + } + + /** + * Create a source of randomness using the default random-number generation algorithm. + * + * @return A randomness source that uses the default algorithm + */ + public static SecureRandom createRandomnessSource() { + return createRandomnessSource(DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + } + + /** + * Create a source of randomness using a custom random-number generation algorithm. + * + * @param randAlgorithm The random-number generation algorithm to use + * @return A randomness sources that uses the specified algorithm + */ + public static SecureRandom createRandomnessSource ( String randAlgorithm ) { + try { + return SecureRandom.getInstance(randAlgorithm); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested random-number generation algorithm %s", randAlgorithm), e); + } + } + + /** + * Writes a public/private key pair to disk + * + * @param keyPair The key pair we're writing to disk + * @param privateKeyFile Location to write the private key + * @param publicKeyFile Location to write the public key + */ + public static void writeKeyPair ( KeyPair keyPair, File privateKeyFile, File publicKeyFile ) { + writeKey(keyPair.getPrivate(), privateKeyFile); + writeKey(keyPair.getPublic(), publicKeyFile); + } + + /** + * Writes an arbitrary key to disk + * + * @param key The key to write + * @param destination Location to write the key to + */ + public static void writeKey ( Key key, File destination ) { + IOUtils.writeByteArrayToFile(key.getEncoded(), destination); + } + + /** + * Reads in a public key created using the default encryption algorithm from a file. + * + * @param source File containing the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( File source ) { + return decodePublicKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a public key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the public key + * @return The public key read + */ + public static PublicKey readPublicKey ( InputStream source ) { + return decodePublicKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a public key into a usable object. + * + * @param rawKey The encoded bytes of a public key as read from, eg., a file. The + * key must be in the standard X.509 format for a public key. + * @param encryptionAlgorithm The encryption algorithm used to create the public key + * @return The public key as a usable object + */ + public static PublicKey decodePublicKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new X509EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePublic(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedStingException("Unable to use X.509 key specification to decode the given key", e); + } + } + + /** + * Reads in a private key created using the default encryption algorithm from a file. + * + * @param source File containing the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( File source ) { + return decodePrivateKey(IOUtils.readFileIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Reads in a private key created using the default encryption algorithm from a stream. + * + * @param source Stream attached to the private key + * @return The private key read + */ + public static PrivateKey readPrivateKey ( InputStream source ) { + return decodePrivateKey(IOUtils.readStreamIntoByteArray(source), DEFAULT_ENCRYPTION_ALGORITHM); + } + + /** + * Decodes the raw bytes of a private key into a usable object. + * + * @param rawKey The encoded bytes of a private key as read from, eg., a file. The + * key must be in the standard PKCS #8 format for a private key. + * @param encryptionAlgorithm The encryption algorithm used to create the private key + * @return The private key as a usable object + */ + public static PrivateKey decodePrivateKey ( byte[] rawKey, String encryptionAlgorithm ) { + try { + KeySpec keySpec = new PKCS8EncodedKeySpec(rawKey); + KeyFactory keyFactory = KeyFactory.getInstance(encryptionAlgorithm); + return keyFactory.generatePrivate(keySpec); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested encryption algorithm %s", encryptionAlgorithm), e); + } + catch ( InvalidKeySpecException e ) { + throw new ReviewedStingException("Unable to use the PKCS #8 key specification to decode the given key", e); + } + } + + /** + * Loads the copy of the GATK public key that is distributed with the GATK. Uses the system + * ClassLoader to locate the public key file, which should be stored at the root of the GATK + * jar file. + * + * @return The GATK public key as a usable object + */ + public static PublicKey loadGATKDistributedPublicKey() { + InputStream publicKeyInputStream = ClassLoader.getSystemResourceAsStream(GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME); + + if ( publicKeyInputStream == null ) { + throw new ReviewedStingException(String.format("Could not locate the GATK public key %s in the classpath", + GATK_DISTRIBUTED_PUBLIC_KEY_FILE_NAME)); + } + + return readPublicKey(publicKeyInputStream); + } + + /** + * Loads the master copy of the GATK private key. You must have the appropriate UNIX permissions + * to do this! + * + * @return The GATK master private key as a usable object + */ + public static PrivateKey loadGATKMasterPrivateKey() { + return readPrivateKey(new File(GATK_MASTER_PRIVATE_KEY_FILE)); + } + + /** + * Loads the master copy of the GATK public key. This should always be the same as the + * public key distributed with the GATK returned by loadGATKDistributedPublicKey(). + * + * @return The GATK master public key as a usable object + */ + public static PublicKey loadGATKMasterPublicKey() { + return readPublicKey(new File(GATK_MASTER_PUBLIC_KEY_FILE)); + } + + /** + * Encrypts the given data using the key provided. + * + * @param data The data to encrypt, as a byte array + * @param encryptKey The key with which to encrypt the data + * @return The encrypted version of the provided data + */ + public static byte[] encryptData ( byte[] data, Key encryptKey ) { + return transformDataUsingCipher(data, encryptKey, Cipher.ENCRYPT_MODE); + } + + /** + * Decrypts the given data using the key provided. + * + * @param encryptedData Data to decrypt, as a byte array + * @param decryptKey The key with which to decrypt the data + * @return The decrypted version of the provided data + */ + public static byte[] decryptData ( byte[] encryptedData, Key decryptKey ) { + return transformDataUsingCipher(encryptedData, decryptKey, Cipher.DECRYPT_MODE); + } + + /** + * Helper method for encryption/decryption that takes data and processes it using + * the given key + * + * @param data Data to encrypt/decrypt + * @param key Key to use to encrypt/decrypt the data + * @param cipherMode Specifies whether we are encrypting or decrypting + * @return The encrypted/decrypted data + */ + private static byte[] transformDataUsingCipher ( byte[] data, Key key, int cipherMode ) { + try { + Cipher cipher = Cipher.getInstance(key.getAlgorithm()); + cipher.init(cipherMode, key); + return cipher.doFinal(data); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Could not find an implementation of the requested algorithm %s", + key.getAlgorithm()), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedStingException("Key is invalid", e); + } + catch ( GeneralSecurityException e ) { + throw new ReviewedStingException("Error during encryption", e); + } + } + + /** + * Tests whether the public/private keys provided can each decrypt data encrypted by + * the other key -- ie., tests whether these two keys are part of the same public/private + * key pair. + * + * @param privateKey The private key to test + * @param publicKey The public key to test + * @return True if the keys are part of the same key pair and can decrypt each other's + * encrypted data, otherwise false. + */ + public static boolean keysDecryptEachOther ( PrivateKey privateKey, PublicKey publicKey ) { + byte[] plainText = "Test PlainText".getBytes(); + + byte[] dataEncryptedUsingPrivateKey = CryptUtils.encryptData(plainText, privateKey); + byte[] dataEncryptedUsingPublicKey = CryptUtils.encryptData(plainText, publicKey); + + byte[] privateKeyDataDecryptedWithPublicKey = CryptUtils.decryptData(dataEncryptedUsingPrivateKey, publicKey); + byte[] publicKeyDataDecryptedWithPrivateKey = CryptUtils.decryptData(dataEncryptedUsingPublicKey, privateKey); + + // Make sure we actually transformed the data during encryption: + if ( Arrays.equals(plainText, dataEncryptedUsingPrivateKey) || + Arrays.equals(plainText, dataEncryptedUsingPublicKey) || + Arrays.equals(dataEncryptedUsingPrivateKey, dataEncryptedUsingPublicKey) ) { + return false; + } + + // Make sure that we were able to recreate the original plaintext using + // both the public key on the private-key-encrypted data and the private + // key on the public-key-encrypted data: + if ( ! Arrays.equals(plainText, privateKeyDataDecryptedWithPublicKey) || + ! Arrays.equals(plainText, publicKeyDataDecryptedWithPrivateKey) ) { + return false; + } + + return true; + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java b/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java new file mode 100644 index 000000000..408cb56aa --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/crypt/GATKKey.java @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.IOUtils; + +import java.io.*; +import java.security.*; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; + +/** + * Class to represent a GATK user key. + * + * A GATK user key contains an email address and a cryptographic signature. + * The signature is the SHA-1 hash of the email address encrypted using + * the GATK master private key. The GATK master public key (distributed + * with the GATK) is used to decrypt the signature and validate the key + * at the start of each GATK run that requires a key. + * + * Keys are cryptographically secure in that valid keys definitely come + * from us and cannot be fabricated, however nothing prevents keys from + * being shared between users. + * + * GATK user keys have the following on-disk format: + * + * GZIP Container: + * Email address + * NUL byte (delimiter) + * Cryptographic Signature (encrypted SHA-1 hash of email address) + * + * The key data is wrapped within a GZIP container to placate over-zealous + * email filters (since keys must often be emailed) and also to provide an + * additional integrity check via the built-in GZIP CRC. + * + * @author David Roazen + */ +public class GATKKey { + + /** + * Private key used to sign the GATK key. Required only when creating a new + * key from scratch, not when loading an existing key from disk. + */ + private PrivateKey privateKey; + + /** + * Public key used to validate the GATK key. + */ + private PublicKey publicKey; + + /** + * The user's email address, stored within the key and signed. + */ + private String emailAddress; + + /** + * The cryptographic signature of the email address. By default, this is + * the SHA-1 hash of the email address encrypted using the RSA algorithm. + */ + private byte[] signature; + + /** + * The combination of hash/encryption algorithms to use to generate the signature. + * By default this is "SHA1withRSA" + */ + private String signingAlgorithm; + + /** + * Default hash/encryption algorithms to use to sign the key. + */ + public static final String DEFAULT_SIGNING_ALGORITHM = "SHA1withRSA"; + + /** + * Byte value used to separate the email address from its signature in the key file. + */ + public static final byte GATK_KEY_SECTIONAL_DELIMITER = 0; + + + // ----------------------- + // Constructors: + // ----------------------- + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair. The private key is used for signing, and the + * public key is used to validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress ) { + this(privateKey, publicKey, emailAddress, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to create a new GATK key from scratch using an email address + * and public/private key pair, and additionally specify the signing algorithm + * to use. The private key is used for signing, and the public key is used to + * validate the newly-created key. + * + * @param privateKey Private key used to sign the new GATK key + * @param publicKey Public key used to validate the new GATK key + * @param emailAddress The user's email address, which we will store in the key and sign + * @param signingAlgorithm The combination of hash and encryption algorithms to use to sign the key + */ + public GATKKey ( PrivateKey privateKey, PublicKey publicKey, String emailAddress, String signingAlgorithm ) { + if ( privateKey == null || publicKey == null || emailAddress == null || emailAddress.length() == 0 || signingAlgorithm == null ) { + throw new ReviewedStingException("Cannot construct GATKKey using null/empty arguments"); + } + + this.privateKey = privateKey; + this.publicKey = publicKey; + this.emailAddress = emailAddress; + this.signingAlgorithm = signingAlgorithm; + + validateEmailAddress(); + generateSignature(); + + if ( ! isValid() ) { + throw new ReviewedStingException("Newly-generated GATK key fails validation -- this should never happen!"); + } + } + + /** + * Constructor to load an existing GATK key from a file. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + */ + public GATKKey ( PublicKey publicKey, File keyFile ) { + this(publicKey, keyFile, DEFAULT_SIGNING_ALGORITHM); + } + + /** + * Constructor to load an existing GATK key from a file, and additionally specify + * the signing algorithm used to sign the key being loaded. + * + * During loading, the key file is checked for integrity, but not cryptographic + * validity (which must be done through a subsequent call to isValid()). + * + * @param publicKey Public key that will be used to validate the loaded GATK key + * in subsequent calls to isValid() + * @param keyFile File containing the GATK key to load + * @param signingAlgorithm The combination of hash and encryption algorithms used to sign the key + */ + public GATKKey ( PublicKey publicKey, File keyFile, String signingAlgorithm ) { + if ( publicKey == null || keyFile == null || signingAlgorithm == null ) { + throw new ReviewedStingException("Cannot construct GATKKey using null arguments"); + } + + this.publicKey = publicKey; + this.signingAlgorithm = signingAlgorithm; + + readKey(keyFile); + } + + // ----------------------- + // Public API Methods: + // ----------------------- + + /** + * Writes out this key to a file in the format described at the top of this class, + * encapsulating the key within a GZIP container. + * + * @param destination File to write the key to + */ + public void writeKey ( File destination ) { + try { + byte[] keyBytes = marshalKeyData(); + IOUtils.writeByteArrayToStream(keyBytes, new GZIPOutputStream(new FileOutputStream(destination))); + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Checks whether the signature of this key is cryptographically valid (ie., can be + * decrypted by the public key to produce a valid SHA-1 hash of the email address + * in the key). + * + * @return True if the key's signature passes validation, otherwise false + */ + public boolean isValid() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initVerify(publicKey); + sig.update(emailAddress.getBytes()); + return sig.verify(signature); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + // If the GATK public key is invalid, it's likely our problem, not the user's: + throw new ReviewedStingException(String.format("Public key %s is invalid", publicKey), e); + } + catch ( SignatureException e ) { + throw new UserException.UnreadableKeyException("Signature is invalid or signing algorithm was unable to process the input data", e); + } + } + + // ----------------------- + // Private Helper Methods: + // ----------------------- + + /** + * Helper method that creates a signature for this key using the combination of + * hash/encryption algorithms specified at construction time. + */ + private void generateSignature() { + try { + Signature sig = Signature.getInstance(signingAlgorithm); + sig.initSign(privateKey, CryptUtils.createRandomnessSource()); + sig.update(emailAddress.getBytes()); + signature = sig.sign(); + } + catch ( NoSuchAlgorithmException e ) { + throw new ReviewedStingException(String.format("Signing algorithm %s not found", signingAlgorithm), e); + } + catch ( InvalidKeyException e ) { + throw new ReviewedStingException(String.format("Private key %s is invalid", privateKey), e); + } + catch ( SignatureException e ) { + throw new ReviewedStingException(String.format("Error creating signature for email address %s", emailAddress), e); + } + } + + /** + * Helper method that reads in a GATK key from a file. Should not be called directly -- + * use the appropriate constructor above. + * + * @param source File to read the key from + */ + private void readKey ( File source ) { + try { + byte[] keyBytes = IOUtils.readStreamIntoByteArray(new GZIPInputStream(new FileInputStream(source))); + + // As a sanity check, compare the number of bytes read to the uncompressed file size + // stored in the GZIP ISIZE field. If they don't match, the key must be corrupt: + if ( keyBytes.length != IOUtils.getGZIPFileUncompressedSize(source) ) { + throw new UserException.UnreadableKeyException("Number of bytes read does not match the uncompressed size specified in the GZIP ISIZE field"); + } + + unmarshalKeyData(keyBytes); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + catch ( IOException e ) { + throw new UserException.UnreadableKeyException(source, e); + } + catch ( UserException.CouldNotReadInputFile e ) { + throw new UserException.UnreadableKeyException(source, e); + } + } + + /** + * Helper method that assembles the email address and signature into a format + * suitable for writing to disk. + * + * @return The aggregated key data, ready to be written to disk + */ + private byte[] marshalKeyData() { + byte[] emailAddressBytes = emailAddress.getBytes(); + byte[] assembledKey = new byte[emailAddressBytes.length + 1 + signature.length]; + + System.arraycopy(emailAddressBytes, 0, assembledKey, 0, emailAddressBytes.length); + assembledKey[emailAddressBytes.length] = GATK_KEY_SECTIONAL_DELIMITER; + System.arraycopy(signature, 0, assembledKey, emailAddressBytes.length + 1, signature.length); + + return assembledKey; + } + + /** + * Helper method that parses the raw key data from disk into its component + * email address and signature. Performs some basic validation in the process. + * + * @param keyBytes The raw, uncompressed key data read from disk + */ + private void unmarshalKeyData ( byte[] keyBytes ) { + int delimiterPosition = -1; + + for ( int i = 0; i < keyBytes.length; i++ ) { + if ( keyBytes[i] == GATK_KEY_SECTIONAL_DELIMITER ) { + delimiterPosition = i; + break; + } + } + + if ( delimiterPosition == -1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no sectional delimiter"); + } + else if ( delimiterPosition == 0 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no email address"); + } + else if ( delimiterPosition == keyBytes.length - 1 ) { + throw new UserException.UnreadableKeyException("Malformed GATK key contains no signature"); + } + + byte[] emailAddressBytes = new byte[delimiterPosition]; + System.arraycopy(keyBytes, 0, emailAddressBytes, 0, delimiterPosition); + emailAddress = new String(emailAddressBytes); + + signature = new byte[keyBytes.length - delimiterPosition - 1]; + System.arraycopy(keyBytes, delimiterPosition + 1, signature, 0, keyBytes.length - delimiterPosition - 1); + } + + /** + * Helper method that ensures that the user's email address does not contain the NUL byte, which we + * reserve as a delimiter within each key file. + */ + private void validateEmailAddress() { + for ( byte b : emailAddress.getBytes() ) { + if ( b == GATK_KEY_SECTIONAL_DELIMITER ) { + throw new UserException(String.format("Email address must not contain a byte with value %d", GATK_KEY_SECTIONAL_DELIMITER)); + } + } + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 2ece3b077..6cc8008d2 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -132,6 +132,10 @@ public class UserException extends ReviewedStingException { public CouldNotReadInputFile(File file, Exception e) { this(file, e.getMessage()); } + + public CouldNotReadInputFile(String message) { + super(message); + } } @@ -151,6 +155,10 @@ public class UserException extends ReviewedStingException { public CouldNotCreateOutputFile(File file, Exception e) { super(String.format("Couldn't write file %s because exception %s", file.getAbsolutePath(), e.getMessage())); } + + public CouldNotCreateOutputFile(String message, Exception e) { + super(message, e); + } } public static class MissortedBAM extends UserException { @@ -319,4 +327,32 @@ public class UserException extends ReviewedStingException { "and try again.", null); } } + + public static class UnreadableKeyException extends UserException { + public UnreadableKeyException ( File f, Exception e ) { + super(String.format("Key file %s cannot be read (possibly the key file is corrupt?). Error was: %s. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.", + f.getAbsolutePath(), e.getMessage())); + } + + public UnreadableKeyException ( String message, Exception e ) { + this(String.format("%s. Error was: %s", message, e.getMessage())); + } + + public UnreadableKeyException ( String message ) { + super(String.format("Key file cannot be read (possibly the key file is corrupt?): %s. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home for help.", + message)); + } + } + + public static class KeySignatureVerificationException extends UserException { + public KeySignatureVerificationException ( File f ) { + super(String.format("The signature in key file %s failed cryptographic verification. " + + "If this key was valid in the past, it's likely been revoked. " + + "Please see http://www.broadinstitute.org/gsa/wiki/index.php/Phone_home " + + "for help.", + f.getAbsolutePath())); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java index a5ba857ef..160df0e51 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/IOUtils.java @@ -29,10 +29,13 @@ import org.apache.commons.io.FilenameUtils; import org.apache.commons.io.LineIterator; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.StingException; import org.broadinstitute.sting.utils.exceptions.UserException; import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; import java.util.*; public class IOUtils { @@ -400,4 +403,173 @@ public class IOUtils { public static boolean isSpecialFile(File file) { return file != null && (file.getAbsolutePath().startsWith("/dev/") || file.equals(DEV_DIR)); } + + /** + * Reads the entirety of the given file into a byte array. Uses a read buffer size of 4096 bytes. + * + * @param source File to read + * @return The contents of the file as a byte array + */ + public static byte[] readFileIntoByteArray ( File source ) { + return readFileIntoByteArray(source, 4096); + } + + /** + * Reads the entirety of the given file into a byte array using the requested read buffer size. + * + * @param source File to read + * @param readBufferSize Number of bytes to read in at one time + * @return The contents of the file as a byte array + */ + public static byte[] readFileIntoByteArray ( File source, int readBufferSize ) { + if ( source == null ) { + throw new ReviewedStingException("Source file was null"); + } + + byte[] fileContents; + + try { + fileContents = readStreamIntoByteArray(new FileInputStream(source), readBufferSize); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotReadInputFile(source, e); + } + + if ( fileContents.length != source.length() ) { + throw new UserException.CouldNotReadInputFile(String.format("Unable to completely read file %s: read only %d/%d bytes", + source.getAbsolutePath(), fileContents.length, source.length())); + } + + return fileContents; + } + + /** + * Reads all data from the given stream into a byte array. Uses a read buffer size of 4096 bytes. + * + * @param in Stream to read data from + * @return The contents of the stream as a byte array + */ + public static byte[] readStreamIntoByteArray ( InputStream in ) { + return readStreamIntoByteArray(in, 4096); + } + + /** + * Reads all data from the given stream into a byte array using the requested read buffer size. + * + * @param in Stream to read data from + * @param readBufferSize Number of bytes to read in at one time + * @return The contents of the stream as a byte array + */ + public static byte[] readStreamIntoByteArray ( InputStream in, int readBufferSize ) { + if ( in == null ) { + throw new ReviewedStingException("Input stream was null"); + } + else if ( readBufferSize <= 0 ) { + throw new ReviewedStingException("Read buffer size must be > 0"); + } + + // Use a fixed-size buffer for each read, but a dynamically-growing buffer + // to hold the accumulated contents of the file/stream: + byte[] readBuffer = new byte[readBufferSize]; + ByteArrayOutputStream fileBuffer = new ByteArrayOutputStream(readBufferSize * 4); + + try { + try { + int currentBytesRead; + + while ( (currentBytesRead = in.read(readBuffer, 0, readBuffer.length)) >= 0 ) { + fileBuffer.write(readBuffer, 0, currentBytesRead); + } + } + finally { + in.close(); + } + } + catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile("I/O error reading from input stream", e); + } + + return fileBuffer.toByteArray(); + } + + /** + * Writes the given array of bytes to a file + * + * @param bytes Data to write + * @param destination File to write the data to + */ + public static void writeByteArrayToFile ( byte[] bytes, File destination ) { + if ( destination == null ) { + throw new ReviewedStingException("Destination file was null"); + } + + try { + writeByteArrayToStream(bytes, new FileOutputStream(destination)); + } + catch ( FileNotFoundException e ) { + throw new UserException.CouldNotCreateOutputFile(destination, e); + } + } + + /** + * Writes the given array of bytes to a stream + * + * @param bytes Data to write + * @param out Stream to write the data to + */ + public static void writeByteArrayToStream ( byte[] bytes, OutputStream out ) { + if ( bytes == null || out == null ) { + throw new ReviewedStingException("Data to write or output stream was null"); + } + + try { + try { + out.write(bytes); + } + finally { + out.close(); + } + } + catch ( IOException e ) { + throw new UserException.CouldNotCreateOutputFile("I/O error writing to output stream", e); + } + } + + /** + * Determines the uncompressed size of a GZIP file. Uses the GZIP ISIZE field in the last + * 4 bytes of the file to get this information. + * + * @param gzipFile GZIP-format file whose uncompressed size to determine + * @return The uncompressed size (in bytes) of the GZIP file + */ + public static int getGZIPFileUncompressedSize ( File gzipFile ) { + if ( gzipFile == null ) { + throw new ReviewedStingException("GZIP file to examine was null"); + } + + try { + // The GZIP ISIZE field holds the uncompressed size of the compressed data. + // It occupies the last 4 bytes of any GZIP file: + RandomAccessFile in = new RandomAccessFile(gzipFile, "r"); + in.seek(gzipFile.length() - 4); + byte[] sizeBytes = new byte[4]; + in.read(sizeBytes, 0, 4); + + ByteBuffer byteBuf = ByteBuffer.wrap(sizeBytes); + byteBuf.order(ByteOrder.LITTLE_ENDIAN); // The GZIP spec mandates little-endian byte order + int uncompressedSize = byteBuf.getInt(); + + // If the size read in is negative, we've overflowed our signed integer: + if ( uncompressedSize < 0 ) { + throw new UserException.CouldNotReadInputFile(String.format("Cannot accurately determine the uncompressed size of file %s " + + "because it's either larger than %d bytes or the GZIP ISIZE field is corrupt", + gzipFile.getAbsolutePath(), Integer.MAX_VALUE)); + } + + return uncompressedSize; + } + catch ( IOException e ) { + throw new UserException.CouldNotReadInputFile(gzipFile, e); + } + } } diff --git a/public/java/test/org/broadinstitute/sting/BaseTest.java b/public/java/test/org/broadinstitute/sting/BaseTest.java index ac3a970f9..bc4ce098b 100755 --- a/public/java/test/org/broadinstitute/sting/BaseTest.java +++ b/public/java/test/org/broadinstitute/sting/BaseTest.java @@ -6,6 +6,7 @@ import org.apache.log4j.Logger; import org.apache.log4j.PatternLayout; import org.apache.log4j.spi.LoggingEvent; import org.broadinstitute.sting.commandline.CommandLineUtils; +import org.broadinstitute.sting.utils.crypt.CryptUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.io.IOUtils; @@ -87,6 +88,9 @@ public abstract class BaseTest { public static final File testDirFile = new File("public/testdata/"); public static final String testDir = testDirFile.getAbsolutePath() + "/"; + public static final String keysDataLocation = validationDataLocation + "keys/"; + public static final String gatkKeyFile = CryptUtils.GATK_USER_KEY_DIRECTORY + "gsamembers_broadinstitute.org.key"; + /** before the class starts up */ static { // setup a basic log configuration diff --git a/public/java/test/org/broadinstitute/sting/WalkerTest.java b/public/java/test/org/broadinstitute/sting/WalkerTest.java index ca7653b58..c9e3b6b1b 100755 --- a/public/java/test/org/broadinstitute/sting/WalkerTest.java +++ b/public/java/test/org/broadinstitute/sting/WalkerTest.java @@ -30,6 +30,7 @@ import org.broad.tribble.FeatureCodec; import org.broad.tribble.Tribble; import org.broad.tribble.index.Index; import org.broad.tribble.index.IndexFactory; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.codecs.vcf.VCFCodec; import org.broadinstitute.sting.gatk.CommandLineExecutable; import org.broadinstitute.sting.gatk.CommandLineGATK; @@ -45,7 +46,7 @@ import java.io.File; import java.util.*; public class WalkerTest extends BaseTest { - private static final boolean ENABLE_REPORTING = false; + private static final boolean ENABLE_PHONE_HOME_FOR_TESTS = false; @BeforeMethod public void initializeRandomGenerator() { @@ -121,11 +122,19 @@ public class WalkerTest extends BaseTest { } public class WalkerTestSpec { + + // Arguments implicitly included in all Walker command lines, unless explicitly + // disabled using the disableImplicitArgs() method below. + final String IMPLICIT_ARGS = ENABLE_PHONE_HOME_FOR_TESTS ? + String.format("-et %s", GATKRunReport.PhoneHomeOption.STANDARD) : + String.format("-et %s -K %s", GATKRunReport.PhoneHomeOption.NO_ET, gatkKeyFile); + String args = ""; int nOutputFiles = -1; List md5s = null; List exts = null; Class expectedException = null; + boolean includeImplicitArgs = true; // the default output path for the integration test private File outputFileLocation = null; @@ -159,6 +168,10 @@ public class WalkerTest extends BaseTest { this.expectedException = expectedException; } + public String getArgsWithImplicitArgs() { + return args + (includeImplicitArgs ? " " + IMPLICIT_ARGS : ""); + } + public void setOutputFileLocation(File outputFileLocation) { this.outputFileLocation = outputFileLocation; } @@ -180,6 +193,9 @@ public class WalkerTest extends BaseTest { auxillaryFiles.put(expectededMD5sum, outputfile); } + public void disableImplicitArgs() { + includeImplicitArgs = false; + } } protected boolean parameterize() { @@ -213,7 +229,7 @@ public class WalkerTest extends BaseTest { tmpFiles.add(fl); } - final String args = String.format(spec.args, tmpFiles.toArray()); + final String args = String.format(spec.getArgsWithImplicitArgs(), tmpFiles.toArray()); System.out.println(Utils.dupString('-', 80)); if ( spec.expectsException() ) { @@ -277,13 +293,10 @@ public class WalkerTest extends BaseTest { * @param args the argument list * @param expectedException the expected exception or null */ - public static void executeTest(String name, String args, Class expectedException) { + private void executeTest(String name, String args, Class expectedException) { CommandLineGATK instance = new CommandLineGATK(); String[] command = Utils.escapeExpressions(args); - // add the logging level to each of the integration test commands - command = Utils.appendArray(command, "-et", ENABLE_REPORTING ? "STANDARD" : "NO_ET"); - // run the executable boolean gotAnException = false; try { diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java new file mode 100644 index 000000000..eae4486c6 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/CryptUtilsUnitTest.java @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.security.Key; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; +import java.util.Arrays; + +public class CryptUtilsUnitTest extends BaseTest { + + @Test + public void testGenerateValidKeyPairWithDefaultSettings() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + Assert.assertTrue(CryptUtils.keysDecryptEachOther(keyPair.getPrivate(), keyPair.getPublic())); + } + + @DataProvider( name = "InvalidKeyPairSettings" ) + public Object[][] invalidKeyPairSettingsDataProvider() { + return new Object[][] { + { -1, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, "Made-up algorithm", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM}, + { CryptUtils.DEFAULT_KEY_LENGTH, CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM, "Made-up algorithm"} + }; + } + + @Test( dataProvider = "InvalidKeyPairSettings", expectedExceptions = ReviewedStingException.class ) + public void testGenerateKeyPairWithInvalidSettings( int keyLength, String encryptionAlgorithm, String randomNumberGenerationAlgorithm ) { + KeyPair keyPair = CryptUtils.generateKeyPair(keyLength, encryptionAlgorithm, randomNumberGenerationAlgorithm); + } + + @Test + public void testGATKMasterKeyPairMutualDecryption() { + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKMasterPublicKey())); + } + + @Test + public void testGATKMasterPrivateKeyWithDistributedPublicKeyMutualDecryption() { + Assert.assertTrue(CryptUtils.keysDecryptEachOther(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey())); + } + + @Test + public void testKeyPairWriteThenRead() { + KeyPair keyPair = CryptUtils.generateKeyPair(); + File privateKeyFile = createTempFile("testKeyPairWriteThenRead_private", "key"); + File publicKeyFile = createTempFile("testKeyPairWriteThenRead_public", "key"); + + CryptUtils.writeKeyPair(keyPair, privateKeyFile, publicKeyFile); + + assertKeysAreEqual(keyPair.getPrivate(), CryptUtils.readPrivateKey(privateKeyFile)); + assertKeysAreEqual(keyPair.getPublic(), CryptUtils.readPublicKey(publicKeyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromFile", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(keyFile)); + } + + @Test + public void testPublicKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPublicKeyWriteThenReadFromStream", "key"); + PublicKey publicKey = CryptUtils.generateKeyPair().getPublic(); + + CryptUtils.writeKey(publicKey, keyFile); + + assertKeysAreEqual(publicKey, CryptUtils.readPublicKey(new FileInputStream(keyFile))); + } + + @Test + public void testPrivateKeyWriteThenReadFromFile() { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromFile", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(keyFile)); + } + + @Test + public void testPrivateKeyWriteThenReadFromStream() throws IOException { + File keyFile = createTempFile("testPrivateKeyWriteThenReadFromStream", "key"); + PrivateKey privateKey = CryptUtils.generateKeyPair().getPrivate(); + + CryptUtils.writeKey(privateKey, keyFile); + + assertKeysAreEqual(privateKey, CryptUtils.readPrivateKey(new FileInputStream(keyFile))); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPublicKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPublicKey(nonExistentFile); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentPrivateKey() { + File nonExistentFile = new File("jdshgkdfhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + CryptUtils.readPrivateKey(nonExistentFile); + } + + @Test + public void testDecodePublicKey() { + PublicKey originalKey = CryptUtils.generateKeyPair().getPublic(); + PublicKey decodedKey = CryptUtils.decodePublicKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testDecodePrivateKey() { + PrivateKey originalKey = CryptUtils.generateKeyPair().getPrivate(); + PrivateKey decodedKey = CryptUtils.decodePrivateKey(originalKey.getEncoded(), CryptUtils.DEFAULT_ENCRYPTION_ALGORITHM); + assertKeysAreEqual(originalKey, decodedKey); + } + + @Test + public void testLoadGATKMasterPrivateKey() { + PrivateKey gatkMasterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + } + + @Test + public void testLoadGATKMasterPublicKey() { + PublicKey gatkMasterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + } + + @Test + public void testLoadGATKDistributedPublicKey() { + PublicKey gatkDistributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + } + + private void assertKeysAreEqual( Key originalKey, Key keyFromDisk ) { + Assert.assertTrue(Arrays.equals(originalKey.getEncoded(), keyFromDisk.getEncoded())); + Assert.assertEquals(originalKey.getAlgorithm(), keyFromDisk.getAlgorithm()); + Assert.assertEquals(originalKey.getFormat(), keyFromDisk.getFormat()); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java new file mode 100644 index 000000000..8fb75ef38 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyIntegrationTest.java @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.WalkerTest; +import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.Arrays; + +public class GATKKeyIntegrationTest extends WalkerTest { + + public static final String BASE_COMMAND = String.format("-T PrintReads -R %s -I %s -o %%s", + testDir + "exampleFASTA.fasta", + testDir + "exampleBAM.bam"); + public static final String MD5_UPON_SUCCESSFUL_RUN = "b9dc5bf6753ca2819e70b056eaf61258"; + + + private void runGATKKeyTest ( String testName, String etArg, String keyArg, Class expectedException, String md5 ) { + String command = BASE_COMMAND + String.format(" %s %s", etArg, keyArg); + + WalkerTestSpec spec = expectedException != null ? + new WalkerTestSpec(command, 1, expectedException) : + new WalkerTestSpec(command, 1, Arrays.asList(md5)); + + spec.disableImplicitArgs(); // Turn off automatic inclusion of -et/-K args by WalkerTest + executeTest(testName, spec); + } + + @Test + public void testValidKeyNoET() { + runGATKKeyTest("testValidKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStdout() { + runGATKKeyTest("testValidKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testValidKeyETStandard() { + runGATKKeyTest("testValidKeyETStandard", + "", + "-K " + keysDataLocation + "valid.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testNoKeyNoET() { + runGATKKeyTest("testNoKeyNoET", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStdout() { + runGATKKeyTest("testNoKeyETStdout", + "-et " + GATKRunReport.PhoneHomeOption.STDOUT, + "", + UserException.class, + null); + } + + @Test + public void testNoKeyETStandard() { + runGATKKeyTest("testNoKeyETStandard", + "", + "", + null, + MD5_UPON_SUCCESSFUL_RUN); + } + + @Test + public void testRevokedKey() { + runGATKKeyTest("testRevokedKey", + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + "revoked.key", + UserException.KeySignatureVerificationException.class, + null); + } + + @DataProvider(name = "CorruptKeyTestData") + public Object[][] corruptKeyDataProvider() { + return new Object[][] { + { "corrupt_empty.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_file.key", UserException.UnreadableKeyException.class }, + { "corrupt_random_contents.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_single_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_deletion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_insertion.key", UserException.UnreadableKeyException.class }, + { "corrupt_multi_byte_change.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_isize_field.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_crc.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_email_address.key", UserException.UnreadableKeyException.class }, + { "corrupt_no_sectional_delimiter.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_no_signature.key", UserException.UnreadableKeyException.class }, + { "corrupt_bad_signature.key", UserException.KeySignatureVerificationException.class }, + { "corrupt_non_gzipped_valid_key.key", UserException.UnreadableKeyException.class } + }; + } + + @Test(dataProvider = "CorruptKeyTestData") + public void testCorruptKey ( String corruptKeyName, Class expectedException ) { + runGATKKeyTest(String.format("testCorruptKey (%s)", corruptKeyName), + "-et " + GATKRunReport.PhoneHomeOption.NO_ET, + "-K " + keysDataLocation + corruptKeyName, + expectedException, + null); + } + + @Test + public void testCorruptButNonRequiredKey() { + runGATKKeyTest("testCorruptButNonRequiredKey", + "", + "-K " + keysDataLocation + "corrupt_random_contents.key", + null, + MD5_UPON_SUCCESSFUL_RUN); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java new file mode 100644 index 000000000..5e7b07a1e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/crypt/GATKKeyUnitTest.java @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.crypt; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; +import org.testng.annotations.Test; +import org.testng.Assert; + +import java.io.File; +import java.security.KeyPair; +import java.security.PrivateKey; +import java.security.PublicKey; + +public class GATKKeyUnitTest extends BaseTest { + + @Test + public void testCreateGATKKeyUsingMasterKeyPair() { + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey masterPublicKey = CryptUtils.loadGATKMasterPublicKey(); + + // We should be able to create a valid GATKKey using our master key pair: + GATKKey key = new GATKKey(masterPrivateKey, masterPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test + public void testCreateGATKKeyUsingMasterPrivateKeyAndDistributedPublicKey() { + PrivateKey masterPrivateKey = CryptUtils.loadGATKMasterPrivateKey(); + PublicKey distributedPublicKey = CryptUtils.loadGATKDistributedPublicKey(); + + // We should also be able to create a valid GATKKey using our master private + // key and the public key we distribute with the GATK: + GATKKey key = new GATKKey(masterPrivateKey, distributedPublicKey, "foo@bar.com"); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testKeyPairMismatch() { + KeyPair firstKeyPair = CryptUtils.generateKeyPair(); + KeyPair secondKeyPair = CryptUtils.generateKeyPair(); + + // Attempting to create a GATK Key with private and public keys that aren't part of the + // same key pair should immediately trigger a validation failure: + GATKKey key = new GATKKey(firstKeyPair.getPrivate(), secondKeyPair.getPublic(), "foo@bar.com"); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testEncryptionAlgorithmMismatch() { + KeyPair keyPair = CryptUtils.generateKeyPair(CryptUtils.DEFAULT_KEY_LENGTH, "DSA", CryptUtils.DEFAULT_RANDOM_NUMBER_GENERATION_ALGORITHM); + + // Attempting to use a DSA private key to create an RSA signature should throw an error: + GATKKey key = new GATKKey(keyPair.getPrivate(), keyPair.getPublic(), "foo@bar.com", "SHA1withRSA"); + } + + @Test( expectedExceptions = UserException.class ) + public void testInvalidEmailAddress() { + String emailAddressWithNulByte = new String(new byte[] { 0 }); + KeyPair keyPair = CryptUtils.generateKeyPair(); + + // Email addresses cannot contain the NUL byte, since it's used as a sectional delimiter in the key file: + GATKKey key = new GATKKey(CryptUtils.loadGATKMasterPrivateKey(), CryptUtils.loadGATKDistributedPublicKey(), + emailAddressWithNulByte); + } + + @Test + public void testCreateGATKKeyFromValidKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "valid.key")); + Assert.assertTrue(key.isValid()); + } + + @Test( expectedExceptions = UserException.UnreadableKeyException.class ) + public void testCreateGATKKeyFromCorruptKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "corrupt_random_contents.key")); + } + + @Test + public void testCreateGATKKeyFromRevokedKeyFile() { + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), new File(keysDataLocation + "revoked.key")); + Assert.assertFalse(key.isValid()); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testCreateGATKKeyFromNonExistentFile() { + File nonExistentFile = new File("ghfdkgsdhg.key"); + Assert.assertFalse(nonExistentFile.exists()); + + GATKKey key = new GATKKey(CryptUtils.loadGATKDistributedPublicKey(), nonExistentFile); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java index 757e6efdf..941d2b14c 100644 --- a/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/io/IOUtilsUnitTest.java @@ -27,12 +27,18 @@ package org.broadinstitute.sting.utils.io; import org.apache.commons.io.FileUtils; import org.broadinstitute.sting.BaseTest; import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; import java.io.IOException; import java.util.Arrays; import java.util.List; +import java.util.Random; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.testng.Assert; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; public class IOUtilsUnitTest extends BaseTest { @@ -230,4 +236,90 @@ public class IOUtilsUnitTest extends BaseTest { Assert.assertFalse(IOUtils.isSpecialFile(new File("/home/user/my.file"))); Assert.assertFalse(IOUtils.isSpecialFile(new File("/devfake/null"))); } + + @DataProvider( name = "ByteArrayIOTestData") + public Object[][] byteArrayIOTestDataProvider() { + return new Object[][] { + // file size, read buffer size + { 0, 4096 }, + { 1, 4096 }, + { 2000, 4096 }, + { 4095, 4096 }, + { 4096, 4096 }, + { 4097, 4096 }, + { 6000, 4096 }, + { 8191, 4096 }, + { 8192, 4096 }, + { 8193, 4096 }, + { 10000, 4096 } + }; + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadFileIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadFileIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToFile(dataWritten, tempFile); + byte[] dataRead = IOUtils.readFileIntoByteArray(tempFile, readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( dataProvider = "ByteArrayIOTestData" ) + public void testWriteThenReadStreamIntoByteArray ( int fileSize, int readBufferSize ) throws Exception { + File tempFile = createTempFile(String.format("testWriteThenReadStreamIntoByteArray_%d_%d", fileSize, readBufferSize), "tmp"); + + byte[] dataWritten = getDeterministicRandomData(fileSize); + IOUtils.writeByteArrayToStream(dataWritten, new FileOutputStream(tempFile)); + byte[] dataRead = IOUtils.readStreamIntoByteArray(new FileInputStream(tempFile), readBufferSize); + + Assert.assertEquals(dataRead.length, dataWritten.length); + Assert.assertTrue(Arrays.equals(dataRead, dataWritten)); + } + + @Test( expectedExceptions = UserException.CouldNotReadInputFile.class ) + public void testReadNonExistentFileIntoByteArray() { + File nonExistentFile = new File("djfhsdkjghdfk"); + Assert.assertFalse(nonExistentFile.exists()); + + IOUtils.readFileIntoByteArray(nonExistentFile); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadNullStreamIntoByteArray() { + IOUtils.readStreamIntoByteArray(null); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testReadStreamIntoByteArrayInvalidBufferSize() throws Exception { + IOUtils.readStreamIntoByteArray(new FileInputStream(createTempFile("testReadStreamIntoByteArrayInvalidBufferSize", "tmp")), + -1); + } + + @Test( expectedExceptions = UserException.CouldNotCreateOutputFile.class ) + public void testWriteByteArrayToUncreatableFile() { + IOUtils.writeByteArrayToFile(new byte[]{0}, new File("/dev/foo/bar")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteNullByteArrayToFile() { + IOUtils.writeByteArrayToFile(null, createTempFile("testWriteNullByteArrayToFile", "tmp")); + } + + @Test( expectedExceptions = ReviewedStingException.class ) + public void testWriteByteArrayToNullStream() { + IOUtils.writeByteArrayToStream(new byte[]{0}, null); + } + + private byte[] getDeterministicRandomData ( int size ) { + GenomeAnalysisEngine.resetRandomGenerator(); + Random rand = GenomeAnalysisEngine.getRandomGenerator(); + + byte[] randomData = new byte[size]; + rand.nextBytes(randomData); + + return randomData; + } } diff --git a/public/keys/GATK_public.key b/public/keys/GATK_public.key new file mode 100644 index 0000000000000000000000000000000000000000..05cdde1c2710421e9c2524eee66f9ea206825091 GIT binary patch literal 294 zcmV+>0ondAf&n5h4F(A+hDe6@4FLfG1potr0S^E$f&mHwf&l>ly(1=10wm#ywI0GW z7t#>rw@^Wh@%LJdafJ!>EoaqBa(Pg_^`O72XYthVZUN1uv@&=X)+9g)nCpEZaBh4O|8L{(%)!MYikf8$T$)3@RaKt;0@_6(dn8yVGeOo;j0s{d60cT=_8~^|S literal 0 HcmV?d00001 diff --git a/public/packages/GATKEngine.xml b/public/packages/GATKEngine.xml index 283b5eabf..68459f6d2 100644 --- a/public/packages/GATKEngine.xml +++ b/public/packages/GATKEngine.xml @@ -36,6 +36,8 @@ + +