From cba89e98ad4879e66fc9bac9d5a102276b0517cb Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Tue, 29 Jan 2013 15:50:46 -0500 Subject: [PATCH 01/11] Refactoring the Bayesian empirical quality estimates to be in a single unit-testable function. --- .../recalibration/BaseRecalibration.java | 163 +++--------------- 1 file changed, 20 insertions(+), 143 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 1f4e92ad7..c8d460308 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -57,6 +57,8 @@ import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.File; +import java.util.ArrayList; +import java.util.List; /** * Utility methods to facilitate on-the-fly base quality score recalibration. @@ -78,10 +80,6 @@ public class BaseRecalibration { private final double globalQScorePrior; private final boolean emitOriginalQuals; - private final NestedIntegerArray globalDeltaQs; - private final NestedIntegerArray deltaQReporteds; - - /** * Constructor using a GATK Report file * @@ -105,44 +103,6 @@ public class BaseRecalibration { this.preserveQLessThan = preserveQLessThan; this.globalQScorePrior = globalQScorePrior; this.emitOriginalQuals = emitOriginalQuals; - - logger.info("Calculating cached tables..."); - - // - // Create a NestedIntegerArray that maps from rgKey x errorModel -> double, - // where the double is the result of this calculation. The entire calculation can - // be done upfront, on initialization of this BaseRecalibration structure - // - final NestedIntegerArray byReadGroupTable = recalibrationTables.getReadGroupTable(); - globalDeltaQs = new NestedIntegerArray( byReadGroupTable.getDimensions() ); - logger.info("Calculating global delta Q table..."); - for ( NestedIntegerArray.Leaf leaf : byReadGroupTable.getAllLeaves() ) { - final int rgKey = leaf.keys[0]; - final int eventIndex = leaf.keys[1]; - final double globalDeltaQ = calculateGlobalDeltaQ(rgKey, EventType.eventFrom(eventIndex)); - globalDeltaQs.put(globalDeltaQ, rgKey, eventIndex); - } - - - // The calculation of the deltaQ report is constant. key[0] and key[1] are the read group and qual, respectively - // and globalDeltaQ is a constant for the read group. So technically the delta Q reported is simply a lookup - // into a matrix indexed by rgGroup, qual, and event type. - // the code below actually creates this cache with a NestedIntegerArray calling into the actual - // calculateDeltaQReported code. - final NestedIntegerArray byQualTable = recalibrationTables.getQualityScoreTable(); - deltaQReporteds = new NestedIntegerArray( byQualTable.getDimensions() ); - logger.info("Calculating delta Q reported table..."); - for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { - final int rgKey = leaf.keys[0]; - final int qual = leaf.keys[1]; - final int eventIndex = leaf.keys[2]; - final EventType event = EventType.eventFrom(eventIndex); - final double globalDeltaQ = getGlobalDeltaQ(rgKey, event); - final double deltaQReported = calculateDeltaQReported(rgKey, qual, event, globalDeltaQ, (byte)qual); - deltaQReporteds.put(deltaQReported, rgKey, qual, eventIndex); - } - - logger.info("done calculating cache"); } /** @@ -189,8 +149,8 @@ public class BaseRecalibration { // the rg key is constant over the whole read, the global deltaQ is too final int rgKey = fullReadKeySet[0][0]; - - final double globalDeltaQ = getGlobalDeltaQ(rgKey, errorModel); + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); + final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read final byte origQual = quals[offset]; @@ -199,11 +159,16 @@ public class BaseRecalibration { if ( origQual >= preserveQLessThan ) { // get the keyset for this base using the error model final int[] keySet = fullReadKeySet[offset]; - final double deltaQReported = getDeltaQReported(keySet[0], keySet[1], errorModel, globalDeltaQ); - final double deltaQCovariates = calculateDeltaQCovariates(recalibrationTables, keySet, errorModel, globalDeltaQ, deltaQReported, origQual); + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); + final List empiricalQualCovs = new ArrayList(); + for (int i = 2; i < requestedCovariates.length; i++) { + if (keySet[i] < 0) { + continue; + } + empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); + } - // calculate the recalibrated qual using the BQSR formula - double recalibratedQualDouble = origQual + globalDeltaQ + deltaQReported + deltaQCovariates; + double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); // recalibrated quality is bound between 1 and MAX_QUAL final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), QualityUtils.MAX_RECALIBRATED_Q_SCORE); @@ -220,102 +185,14 @@ public class BaseRecalibration { } } - private double getGlobalDeltaQ(final int rgKey, final EventType errorModel) { - final Double cached = globalDeltaQs.get(rgKey, errorModel.ordinal()); - - if ( TEST_CACHING ) { - final double calcd = calculateGlobalDeltaQ(rgKey, errorModel); - if ( calcd != cached ) - throw new IllegalStateException("calculated " + calcd + " and cached " + cached + " global delta q not equal at " + rgKey + " / " + errorModel); + protected double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { + final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); + final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); + double deltaQCovariates = 0.0; + for( final RecalDatum empiricalQualCov : empiricalQualCovs ) { + deltaQCovariates += ( empiricalQualCov == null ? 0.0 : empiricalQualCov.getEmpiricalQuality(deltaQReported + globalDeltaQ + epsilon) - (deltaQReported + globalDeltaQ + epsilon) ); } - return cachedWithDefault(cached); - } - - private double getDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ) { - final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.ordinal()); - - if ( TEST_CACHING ) { - final double calcd = calculateDeltaQReported(rgKey, qualKey, errorModel, globalDeltaQ, (byte)qualKey); - if ( calcd != cached ) - throw new IllegalStateException("calculated " + calcd + " and cached " + cached + " global delta q not equal at " + rgKey + " / " + qualKey + " / " + errorModel); - } - - return cachedWithDefault(cached); - } - - /** - * @param d a Double (that may be null) that is the result of a delta Q calculation - * @return a double == d if d != null, or 0.0 if it is - */ - private double cachedWithDefault(final Double d) { - return d == null ? 0.0 : d; - } - - /** - * Note that this calculation is a constant for each rgKey and errorModel. We need only - * compute this value once for all data. - * - * @param rgKey read group key - * @param errorModel the event type - * @return global delta Q - */ - private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { - double result = 0.0; - - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); - - if (empiricalQualRG != null) { - final double aggregrateQReported = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); - final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(aggregrateQReported); - result = globalDeltaQEmpirical - aggregrateQReported; - } - - return result; - } - - private double calculateDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { - double result = 0.0; - - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.ordinal()); - if (empiricalQualQS != null) { - final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(globalDeltaQ + qualFromRead); - result = deltaQReportedEmpirical - (globalDeltaQ + qualFromRead); - } - - return result; - } - - private double calculateDeltaQCovariates(final RecalibrationTables recalibrationTables, final int[] key, final EventType errorModel, final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { - double result = 0.0; - - // for all optional covariates - for (int i = 2; i < requestedCovariates.length; i++) { - if (key[i] < 0) - continue; - - result += calculateDeltaQCovariate(recalibrationTables.getTable(i), - key[0], key[1], key[i], errorModel, - globalDeltaQ, deltaQReported, qualFromRead); - } - - return result; - } - - private double calculateDeltaQCovariate(final NestedIntegerArray table, - final int rgKey, - final int qualKey, - final int tableKey, - final EventType errorModel, - final double globalDeltaQ, - final double deltaQReported, - final byte qualFromRead) { - final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.ordinal()); - if (empiricalQualCO != null) { - final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(deltaQReported + globalDeltaQ + qualFromRead); - return deltaQCovariateEmpirical - (deltaQReported + globalDeltaQ + qualFromRead); - } else { - return 0.0; - } + return epsilon + globalDeltaQ + deltaQReported + deltaQCovariates; } } From 59311aeea21842f7a6c69df1f49230edf90ee98f Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 10:06:14 -0500 Subject: [PATCH 02/11] Getting back null values from the tables is perfectly reasonable if those covariates don't appear in your table. Need to handle them gracefully. --- .../recalibration/BaseRecalibration.java | 49 ++++++++++--------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index c8d460308..bc7c5a1ce 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -150,33 +150,36 @@ public class BaseRecalibration { // the rg key is constant over the whole read, the global deltaQ is too final int rgKey = fullReadKeySet[0][0]; final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); - final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); - for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read - final byte origQual = quals[offset]; + if( empiricalQualRG != null ) { + final double epsilon = ( globalQScorePrior > 0.0 && errorModel.equals(EventType.BASE_SUBSTITUTION) ? globalQScorePrior : empiricalQualRG.getEstimatedQReported() ); - // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) - if ( origQual >= preserveQLessThan ) { - // get the keyset for this base using the error model - final int[] keySet = fullReadKeySet[offset]; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); - final List empiricalQualCovs = new ArrayList(); - for (int i = 2; i < requestedCovariates.length; i++) { - if (keySet[i] < 0) { - continue; + for (int offset = 0; offset < readLength; offset++) { // recalibrate all bases in the read + final byte origQual = quals[offset]; + + // only recalibrate usable qualities (the original quality will come from the instrument -- reported quality) + if ( origQual >= preserveQLessThan ) { + // get the keyset for this base using the error model + final int[] keySet = fullReadKeySet[offset]; + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(keySet[0], keySet[1], errorModel.ordinal()); + final List empiricalQualCovs = new ArrayList(); + for (int i = 2; i < requestedCovariates.length; i++) { + if (keySet[i] < 0) { + continue; + } + empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); } - empiricalQualCovs.add(recalibrationTables.getTable(i).get(keySet[0], keySet[1], keySet[i], errorModel.ordinal())); + + double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); + + // recalibrated quality is bound between 1 and MAX_QUAL + final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), QualityUtils.MAX_RECALIBRATED_Q_SCORE); + + // return the quantized version of the recalibrated quality + final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); + + quals[offset] = recalibratedQualityScore; } - - double recalibratedQualDouble = hierarchicalBayesianQualityEstimate( epsilon, empiricalQualRG, empiricalQualQS, empiricalQualCovs ); - - // recalibrated quality is bound between 1 and MAX_QUAL - final byte recalibratedQual = QualityUtils.boundQual(MathUtils.fastRound(recalibratedQualDouble), QualityUtils.MAX_RECALIBRATED_Q_SCORE); - - // return the quantized version of the recalibrated quality - final byte recalibratedQualityScore = quantizationInfo.getQuantizedQuals().get(recalibratedQual); - - quals[offset] = recalibratedQualityScore; } } From 2967776458e736175636d8321a3232f02634b7db Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 12:28:14 -0500 Subject: [PATCH 03/11] The Empirical quality column in the recalibration report can't be compared in the BQSRGatherer because the value is calculated using the Bayesian estimate with different priors. This value should never be used from a recalibration report anyway except during plotting. --- .../recalibration/BaseRecalibration.java | 2 +- .../sting/utils/recalibration/RecalDatum.java | 1 + .../recalibration/RecalibrationReport.java | 2 +- .../walkers/bqsr/BQSRGathererUnitTest.java | 6 ++-- .../recalibration/RecalDatumUnitTest.java | 33 +++++++++++++++++++ 5 files changed, 39 insertions(+), 5 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index bc7c5a1ce..f3d4a3096 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -188,7 +188,7 @@ public class BaseRecalibration { } } - protected double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { + protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); double deltaQCovariates = 0.0; diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 0b4441439..bd0063a7d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -181,6 +181,7 @@ public class RecalDatum { if ( Double.isNaN(estimatedQReported) ) throw new IllegalArgumentException("estimatedQReported is NaN"); this.estimatedQReported = estimatedQReported; + empiricalQuality = UNINITIALIZED; } public final double getEstimatedQReported() { diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 2f9a38972..f10c26ddc 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -131,7 +131,7 @@ public class RecalibrationReport { * Combines two recalibration reports by adding all observations and errors * * Note: This method DOES NOT recalculate the empirical qualities and quantized qualities. You have to recalculate - * them after combining. The reason for not calculating it is because this function is inteded for combining a + * them after combining. The reason for not calculating it is because this function is intended for combining a * series of recalibration reports, and it only makes sense to calculate the empirical qualities and quantized * qualities after all the recalibration reports have been combined. Having the user recalculate when appropriate, * makes this method faster diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index 2815599d9..f82f24439 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -125,19 +125,19 @@ public class BQSRGathererUnitTest extends BaseTest { testTablesWithColumns(originalTable, calculatedTable, columnsToTest); // test the RecalTable0 table - columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); originalTable = originalReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); calculatedTable = calculatedReport.getTable(RecalUtils.READGROUP_REPORT_TABLE_TITLE); testTablesWithColumns(originalTable, calculatedTable, columnsToTest); // test the RecalTable1 table - columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); originalTable = originalReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE); calculatedTable = calculatedReport.getTable(RecalUtils.QUALITY_SCORE_REPORT_TABLE_TITLE); testTablesWithColumns(originalTable, calculatedTable, columnsToTest); // test the RecalTable2 table - columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); + columnsToTest = Arrays.asList(RecalUtils.READGROUP_COLUMN_NAME, RecalUtils.QUALITY_SCORE_COLUMN_NAME, RecalUtils.COVARIATE_VALUE_COLUMN_NAME, RecalUtils.COVARIATE_NAME_COLUMN_NAME, RecalUtils.EVENT_TYPE_COLUMN_NAME, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, RecalUtils.NUMBER_ERRORS_COLUMN_NAME); originalTable = originalReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); calculatedTable = calculatedReport.getTable(RecalUtils.ALL_COVARIATES_REPORT_TABLE_TITLE); testTablesWithColumns(originalTable, calculatedTable, columnsToTest); diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 09f751fbc..37f0fcc0c 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -58,6 +58,7 @@ import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.util.Arrays; +import java.util.Collections; public class RecalDatumUnitTest extends BaseTest { @@ -277,4 +278,36 @@ public class RecalDatumUnitTest extends BaseTest { Assert.assertFalse(Double.isInfinite(log10likelihood)); Assert.assertFalse(Double.isNaN(log10likelihood)); } + + @Test + public void basicHierarchicalBayesianQualityEstimateTest() { + { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + // initial epsilon condition shouldn't matter when there are a lot of observations + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + } + + { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + + // initial epsilon condition dominates when there is no data + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), 45.0, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), 25.0, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), 5.0, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), 65.0, 1E-4 ); + } + + } } \ No newline at end of file From 9985f82a7af60dbbdf0c1343a02bd6b10ada218c Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 30 Jan 2013 12:54:57 -0500 Subject: [PATCH 05/11] Move BaseUtils back to the GATK by request, along with associated utility methods --- .../gatk/walkers/annotator/GCContent.java | 2 +- .../walkers/annotator/HaplotypeScore.java | 2 +- .../gatk/walkers/bqsr/BaseRecalibrator.java | 2 +- .../DiploidSNPGenotypeLikelihoods.java | 2 +- .../GeneralPloidySNPGenotypeLikelihoods.java | 3 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../GenotypeLikelihoodsCalculationModel.java | 2 +- ...elGenotypeLikelihoodsCalculationModel.java | 2 +- ...NPGenotypeLikelihoodsCalculationModel.java | 2 +- .../genotyper/UnifiedGenotyperEngine.java | 2 +- .../haplotypecaller/GenotypingEngine.java | 2 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../sting/gatk/walkers/phasing/BaseArray.java | 2 +- .../gatk/walkers/phasing/PhasingRead.java | 2 +- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../gatk/walkers/phasing/SNPallelePair.java | 2 +- .../VariantRecalibrator.java | 4 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../covariates/ContextCovariate.java | 2 +- .../covariates/CycleCovariate.java | 2 +- .../covariates/RepeatCovariate.java | 2 +- .../ArtificialReadPileupTestProvider.java | 2 +- ...eralPloidyGenotypeLikelihoodsUnitTest.java | 2 +- .../ConcordanceMetricsUnitTest.java | 2 +- .../sting/utils/pairhmm/PairHMMUnitTest.java | 2 +- .../RepeatCovariatesUnitTest.java | 2 +- .../sting/alignment/Alignment.java | 2 +- .../sting/alignment/AlignmentValidation.java | 2 +- .../bwa/java/AlignerTestHarness.java | 2 +- .../alignment/bwa/java/BWAJavaAligner.java | 2 +- .../sting/gatk/contexts/ReferenceContext.java | 2 +- .../AlleleBiasedDownsamplingUtils.java | 2 +- .../gatk/walkers/annotator/BaseCounts.java | 2 +- .../gatk/walkers/annotator/NBaseCount.java | 2 +- .../walkers/annotator/VariantAnnotator.java | 2 +- .../gatk/walkers/coverage/CallableLoci.java | 2 +- .../gatk/walkers/coverage/CoverageUtils.java | 2 +- .../walkers/coverage/DepthOfCoverage.java | 2 +- .../coverage/DepthOfCoverageStats.java | 2 +- .../walkers/coverage/GCContentByInterval.java | 2 +- .../diagnostics/ErrorRatePerCycle.java | 2 +- .../sting/gatk/walkers/fasta/FastaStats.java | 2 +- .../sting/gatk/walkers/qc/QCRef.java | 2 +- .../gatk/walkers/readutils/ClipReads.java | 2 +- .../validation/ValidationAmplicons.java | 2 +- .../evaluators/MultiallelicSummary.java | 4 +- .../evaluators/TiTvVariantEvaluator.java | 6 +- .../evaluators/VariantSummary.java | 4 +- .../variantutils/LiftoverVariants.java | 6 +- .../walkers/variantutils/VariantsToTable.java | 3 +- .../walkers/variantutils/VariantsToVCF.java | 2 +- .../{variant => sting}/utils/BaseUtils.java | 17 +++-- .../sting/utils/duplicates/DupUtils.java | 2 +- .../CachingIndexedFastaSequenceFile.java | 2 +- .../utils/genotyper/DiploidGenotype.java | 2 +- .../sting/utils/pileup/PileupElement.java | 2 +- .../utils/pileup/ReadBackedPileupImpl.java | 2 +- .../sting/utils/sam/AlignmentUtils.java | 2 +- .../sting/utils/sam/ReadUtils.java | 2 +- .../variant/GATKVariantContextUtils.java | 67 +++++++++++++++++-- .../variant/variantcontext/Allele.java | 4 +- .../variantcontext/VariantContextUtils.java | 63 ----------------- .../utils/BaseUtilsUnitTest.java | 14 ++-- ...chingIndexedFastaSequenceFileUnitTest.java | 2 +- .../sting/utils/sam/ReadUtilsUnitTest.java | 2 +- .../GenotypeLikelihoodsUnitTest.java | 7 +- 66 files changed, 153 insertions(+), 155 deletions(-) rename public/java/src/org/broadinstitute/{variant => sting}/utils/BaseUtils.java (95%) rename public/java/test/org/broadinstitute/{variant => sting}/utils/BaseUtilsUnitTest.java (93%) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java index 2b3290595..93bdf8c9d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/GCContent.java @@ -54,7 +54,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java index 3acba48ae..13969eb54 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/annotator/HaplotypeScore.java @@ -55,7 +55,7 @@ import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompa import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.StandardAnnotation; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.variant.vcf.VCFHeaderLineType; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 2df5fefa8..354e508c2 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -60,7 +60,7 @@ import org.broadinstitute.sting.gatk.iterators.ReadTransformer; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.clipping.ReadClipper; import org.broadinstitute.sting.utils.collections.Pair; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java index 8c8de2bad..2baa89999 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/DiploidSNPGenotypeLikelihoods.java @@ -47,7 +47,7 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.UserException; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java index aa117eb3b..14bffbc34 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoods.java @@ -49,9 +49,8 @@ package org.broadinstitute.sting.gatk.walkers.genotyper; import net.sf.samtools.SAMUtils; import org.broadinstitute.sting.gatk.walkers.genotyper.afcalc.ExactACset; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java index 9aa8c13ec..9ea027698 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GeneralPloidySNPGenotypeLikelihoodsCalculationModel.java @@ -77,7 +77,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.utils.*; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.*; import java.util.*; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java index b3740bbb7..f48ae81cf 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/GenotypeLikelihoodsCalculationModel.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java index 84c109c9d..5a1bdf9e5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/IndelGenotypeLikelihoodsCalculationModel.java @@ -52,7 +52,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.indels.PairHMMIndelErrorModel; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.Haplotype; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java index 7dc3e8ee3..0652cc236 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/SNPGenotypeLikelihoodsCalculationModel.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.MathUtils; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java index 8f6097661..19d218023 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperEngine.java @@ -62,7 +62,7 @@ import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.pileup.PileupElement; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index 9aeffe966..d254f5b8b 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -58,7 +58,7 @@ import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.*; import java.io.PrintStream; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java index 851703648..ad554a130 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/indels/IndelRealigner.java @@ -75,7 +75,7 @@ import org.broadinstitute.sting.utils.sam.NWaySAMFileWriter; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import org.broadinstitute.sting.utils.text.XReadLines; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.variant.variantcontext.VariantContext; import java.io.File; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java index de91765b7..cbc6a1f94 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/BaseArray.java @@ -46,7 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import java.util.Arrays; import java.util.LinkedList; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java index f2ba027f8..a04789f61 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/PhasingRead.java @@ -46,7 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index 56fc59c7f..980894112 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -59,7 +59,7 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.variant.GATKVCFUtils; import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.SampleUtils; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java index 0a4783214..8dca985c8 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/SNPallelePair.java @@ -46,7 +46,7 @@ package org.broadinstitute.sting.gatk.walkers.phasing; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.variant.variantcontext.Allele; import org.broadinstitute.variant.variantcontext.Genotype; diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java index 433900d02..d8d79e26c 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantRecalibrator.java @@ -59,6 +59,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; import org.broadinstitute.sting.utils.R.RScriptExecutor; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.variant.GATKVariantContextUtils; import org.broadinstitute.variant.vcf.VCFHeader; import org.broadinstitute.variant.vcf.VCFHeaderLine; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; @@ -66,7 +67,6 @@ import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.variant.variantcontext.VariantContext; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import org.broadinstitute.variant.variantcontext.writer.VariantContextWriter; import java.io.File; @@ -274,7 +274,7 @@ public class VariantRecalibrator extends RodWalker { if ( toInterval != null ) { // check whether the strand flips, and if so reverse complement everything if ( fromInterval.isPositiveStrand() != toInterval.isPositiveStrand() && vc.isPointEvent() ) { - vc = VariantContextUtils.reverseComplement(vc); + vc = GATKVariantContextUtils.reverseComplement(vc); } vc = new VariantContextBuilder(vc).loc(toInterval.getSequence(), toInterval.getStart(), toInterval.getStart() + length).make(); @@ -133,7 +133,7 @@ public class LiftoverVariants extends RodWalker { .attribute("OriginalStart", fromInterval.getStart()).make(); } - if ( originalVC.isSNP() && originalVC.isBiallelic() && VariantContextUtils.getSNPSubstitutionType(originalVC) != VariantContextUtils.getSNPSubstitutionType(vc) ) { + if ( originalVC.isSNP() && originalVC.isBiallelic() && GATKVariantContextUtils.getSNPSubstitutionType(originalVC) != GATKVariantContextUtils.getSNPSubstitutionType(vc) ) { logger.warn(String.format("VCF at %s / %d => %s / %d is switching substitution type %s/%s to %s/%s", originalVC.getChr(), originalVC.getStart(), vc.getChr(), vc.getStart(), originalVC.getReference(), originalVC.getAlternateAllele(0), vc.getReference(), vc.getAlternateAllele(0))); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java index 1ea85df47..f6c02592d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java @@ -41,7 +41,6 @@ import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.variant.variantcontext.VariantContextUtils; import java.io.PrintStream; import java.lang.reflect.Array; @@ -432,7 +431,7 @@ public class VariantsToTable extends RodWalker { getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } }); getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) { if ( vc.isSNP() && vc.isBiallelic() ) - return VariantContextUtils.isTransition(vc) ? "1" : "0"; + return GATKVariantContextUtils.isTransition(vc) ? "1" : "0"; else return "-1"; }}); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java index 5afeccffe..29e7ad8d7 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToVCF.java @@ -39,7 +39,7 @@ import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.walkers.Reference; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.Window; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.codecs.hapmap.RawHapMapFeature; diff --git a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java similarity index 95% rename from public/java/src/org/broadinstitute/variant/utils/BaseUtils.java rename to public/java/src/org/broadinstitute/sting/utils/BaseUtils.java index 76b49edb9..e6e1db813 100644 --- a/public/java/src/org/broadinstitute/variant/utils/BaseUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/BaseUtils.java @@ -23,12 +23,14 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.variant.utils; +package org.broadinstitute.sting.utils; import net.sf.samtools.util.StringUtil; +import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; +import org.broadinstitute.sting.utils.exceptions.UserException; import java.util.Arrays; -import java.util.Random; /** * BaseUtils contains some basic utilities for manipulating nucleotides. @@ -95,9 +97,6 @@ public class BaseUtils { baseIndexWithIupacMap['v'] = Base.N.ordinal(); } - // Use a fixed random seed to allow for deterministic results when using random bases - private static final Random randomNumberGen = new Random(47382911L); - /// In genetics, a transition is a mutation changing a purine to another purine nucleotide (A <-> G) or // a pyrimidine to another pyrimidine nucleotide (C <-> T). // Approximately two out of every three single nucleotide polymorphisms (SNPs) are transitions. @@ -174,7 +173,7 @@ public class BaseUtils { if ( baseIndex == Base.N.ordinal() ) { bases[i] = 'N'; } else if ( errorOnBadReferenceBase && baseIndex == -1 ) { - throw new IllegalStateException("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); + throw new UserException.BadInput("We encountered a non-standard non-IUPAC base in the provided reference: '" + bases[i] + "'"); } } return bases; @@ -251,7 +250,7 @@ public class BaseUtils { */ static public int simpleBaseToBaseIndex(final byte base) { if ( base < 0 || base >= 256 ) - throw new IllegalArgumentException("Non-standard bases were encountered in either the input reference or BAM file(s)"); + throw new UserException.BadInput("Non-standard bases were encountered in either the input reference or BAM file(s)"); return baseIndexMap[base]; } @@ -491,7 +490,7 @@ public class BaseUtils { int randomBaseIndex = excludeBaseIndex; while (randomBaseIndex == excludeBaseIndex) { - randomBaseIndex = randomNumberGen.nextInt(4); + randomBaseIndex = GenomeAnalysisEngine.getRandomGenerator().nextInt(4); } return randomBaseIndex; @@ -515,7 +514,7 @@ public class BaseUtils { case 'N': return 'N'; default: - throw new IllegalArgumentException("base must be A, C, G or T. " + (char) base + " is not a valid base."); + throw new ReviewedStingException("base must be A, C, G or T. " + (char) base + " is not a valid base."); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java index 39f5b06c6..c78294505 100644 --- a/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.duplicates; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; import org.broadinstitute.sting.utils.QualityUtils; diff --git a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java index a749625cd..c30ac4f7f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java +++ b/public/java/src/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFile.java @@ -33,7 +33,7 @@ import net.sf.samtools.SAMSequenceRecord; import net.sf.samtools.util.StringUtil; import org.apache.log4j.Priority; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; import java.io.FileNotFoundException; diff --git a/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java b/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java index febc62716..ceae4bb47 100644 --- a/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/genotyper/DiploidGenotype.java @@ -25,7 +25,7 @@ package org.broadinstitute.sting.utils.genotyper; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; public enum DiploidGenotype { AA ('A', 'A'), diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java index 5a5358208..51753ca5e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/PileupElement.java @@ -29,7 +29,7 @@ import com.google.java.contract.Ensures; import com.google.java.contract.Requires; import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; diff --git a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java index fe43f85bd..65c47c23b 100644 --- a/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java +++ b/public/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileupImpl.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.utils.fragments.FragmentCollection; import org.broadinstitute.sting.utils.fragments.FragmentUtils; import org.broadinstitute.sting.utils.locusiterator.LocusIteratorByState; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java index b7a813ec2..9c5f95472 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/AlignmentUtils.java @@ -30,7 +30,7 @@ import net.sf.samtools.CigarElement; import net.sf.samtools.CigarOperator; import net.sf.samtools.SAMRecord; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; diff --git a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java index 29f8c8dcd..39d058aea 100644 --- a/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/sam/ReadUtils.java @@ -34,7 +34,7 @@ import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.NGSPlatform; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import java.io.File; import java.util.*; diff --git a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java index 2ae289214..f22066f8e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/variant/GATKVariantContextUtils.java @@ -30,10 +30,7 @@ import org.apache.commons.lang.ArrayUtils; import org.apache.log4j.Logger; import org.broad.tribble.TribbleException; import org.broad.tribble.util.popgen.HardyWeinbergCalculation; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; -import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.*; import org.broadinstitute.sting.utils.collections.Pair; import org.broadinstitute.variant.variantcontext.*; import org.broadinstitute.variant.vcf.VCFConstants; @@ -108,6 +105,68 @@ public class GATKVariantContextUtils { return genomeLocParser.createGenomeLoc(vc.getChr(), vc.getStart(), vc.getEnd(), true); } + public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { + if (!context.isSNP() || !context.isBiallelic()) + throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); + return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); + } + + /** + * If this is a BiAlleic SNP, is it a transition? + */ + public static boolean isTransition(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + /** + * If this is a BiAlleic SNP, is it a transversion? + */ + public static boolean isTransversion(VariantContext context) { + return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + public static boolean isTransition(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; + } + + public static boolean isTransversion(Allele ref, Allele alt) { + return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; + } + + /** + * Returns a context identical to this with the REF and ALT alleles reverse complemented. + * + * @param vc variant context + * @return new vc + */ + public static VariantContext reverseComplement(VariantContext vc) { + // create a mapping from original allele to reverse complemented allele + HashMap alleleMap = new HashMap(vc.getAlleles().size()); + for ( Allele originalAllele : vc.getAlleles() ) { + Allele newAllele; + if ( originalAllele.isNoCall() ) + newAllele = originalAllele; + else + newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); + alleleMap.put(originalAllele, newAllele); + } + + // create new Genotype objects + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + List newAlleles = new ArrayList(); + for ( Allele allele : genotype.getAlleles() ) { + Allele newAllele = alleleMap.get(allele); + if ( newAllele == null ) + newAllele = Allele.NO_CALL; + newAlleles.add(newAllele); + } + newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); + } + + return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); + } + /** * Returns true iff VC is an non-complex indel where every allele represents an expansion or * contraction of a series of identical bases in the reference. diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java index 0a0b4d0b7..e0a6495a5 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/Allele.java @@ -25,7 +25,7 @@ package org.broadinstitute.variant.variantcontext; -import org.broadinstitute.variant.utils.BaseUtils; +import net.sf.samtools.util.StringUtil; import java.util.Arrays; import java.util.Collection; @@ -130,7 +130,7 @@ public class Allele implements Comparable { if ( isRef ) throw new IllegalArgumentException("Cannot tag a symbolic allele as the reference allele"); } else { - BaseUtils.convertToUpperCase(bases); + StringUtil.toUpperCase(bases); } this.isRef = isRef; diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java index fa2b5c9e5..11d84e318 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java @@ -30,7 +30,6 @@ import com.google.java.contract.Requires; import org.apache.commons.jexl2.Expression; import org.apache.commons.jexl2.JexlEngine; import org.broad.tribble.TribbleException; -import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.variant.utils.GeneralUtils; import org.broadinstitute.variant.vcf.*; @@ -432,40 +431,6 @@ public class VariantContextUtils { // } - /** - * Returns a context identical to this with the REF and ALT alleles reverse complemented. - * - * @param vc variant context - * @return new vc - */ - public static VariantContext reverseComplement(VariantContext vc) { - // create a mapping from original allele to reverse complemented allele - HashMap alleleMap = new HashMap(vc.getAlleles().size()); - for ( Allele originalAllele : vc.getAlleles() ) { - Allele newAllele; - if ( originalAllele.isNoCall() ) - newAllele = originalAllele; - else - newAllele = Allele.create(BaseUtils.simpleReverseComplement(originalAllele.getBases()), originalAllele.isReference()); - alleleMap.put(originalAllele, newAllele); - } - - // create new Genotype objects - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - List newAlleles = new ArrayList(); - for ( Allele allele : genotype.getAlleles() ) { - Allele newAllele = alleleMap.get(allele); - if ( newAllele == null ) - newAllele = Allele.NO_CALL; - newAlleles.add(newAllele); - } - newGenotypes.add(new GenotypeBuilder(genotype).alleles(newAlleles).make()); - } - - return new VariantContextBuilder(vc).alleles(alleleMap.values()).genotypes(newGenotypes).make(); - } - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { if ( allowedAttributes == null ) return vc; @@ -483,34 +448,6 @@ public class VariantContextUtils { return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); } - public static BaseUtils.BaseSubstitutionType getSNPSubstitutionType(VariantContext context) { - if (!context.isSNP() || !context.isBiallelic()) - throw new IllegalStateException("Requested SNP substitution type for bialleic non-SNP " + context); - return BaseUtils.SNPSubstitutionType(context.getReference().getBases()[0], context.getAlternateAllele(0).getBases()[0]); - } - - /** - * If this is a BiAlleic SNP, is it a transition? - */ - public static boolean isTransition(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - /** - * If this is a BiAlleic SNP, is it a transversion? - */ - public static boolean isTransversion(VariantContext context) { - return getSNPSubstitutionType(context) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - - public static boolean isTransition(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSITION; - } - - public static boolean isTransversion(Allele ref, Allele alt) { - return BaseUtils.SNPSubstitutionType(ref.getBases()[0], alt.getBases()[0]) == BaseUtils.BaseSubstitutionType.TRANSVERSION; - } - public static int getSize( VariantContext vc ) { return vc.getEnd() - vc.getStart() + 1; } diff --git a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java similarity index 93% rename from public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java rename to public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java index 738fa28e2..cb80dbc09 100644 --- a/public/java/test/org/broadinstitute/variant/utils/BaseUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/BaseUtilsUnitTest.java @@ -23,20 +23,22 @@ * THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -package org.broadinstitute.variant.utils; +package org.broadinstitute.sting.utils; -import org.broadinstitute.variant.VariantBaseTest; +import org.broadinstitute.sting.BaseTest; import org.testng.Assert; import org.testng.annotations.Test; import org.testng.annotations.BeforeClass; -public class BaseUtilsUnitTest extends VariantBaseTest { +public class BaseUtilsUnitTest extends BaseTest { @BeforeClass public void init() { } @Test public void testMostFrequentBaseFraction() { + logger.warn("Executing testMostFrequentBaseFraction"); + compareFrequentBaseFractionToExpected("AAAAA", 1.0); compareFrequentBaseFractionToExpected("ACCG", 0.5); compareFrequentBaseFractionToExpected("ACCCCTTTTG", 4.0/10.0); @@ -44,7 +46,7 @@ public class BaseUtilsUnitTest extends VariantBaseTest { private void compareFrequentBaseFractionToExpected(String sequence, double expected) { double fraction = BaseUtils.mostFrequentBaseFraction(sequence.getBytes()); - Assert.assertTrue(GeneralUtils.compareDoubles(fraction, expected) == 0); + Assert.assertTrue(MathUtils.compareDoubles(fraction, expected) == 0); } @Test @@ -64,6 +66,8 @@ public class BaseUtilsUnitTest extends VariantBaseTest { @Test public void testTransitionTransversion() { + logger.warn("Executing testTransitionTransversion"); + Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'T' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'C' ) == BaseUtils.BaseSubstitutionType.TRANSVERSION ); Assert.assertTrue( BaseUtils.SNPSubstitutionType( (byte)'A', (byte)'G' ) == BaseUtils.BaseSubstitutionType.TRANSITION ); @@ -89,6 +93,8 @@ public class BaseUtilsUnitTest extends VariantBaseTest { @Test public void testReverseComplementString() { + logger.warn("Executing testReverseComplementString"); + compareRCStringToExpected("ACGGT", "ACCGT"); compareRCStringToExpected("TCGTATATCTCGCTATATATATATAGCTCTAGTATA", "TATACTAGAGCTATATATATATAGCGAGATATACGA"); compareRCStringToExpected("AAAN", "NTTT"); diff --git a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java index b65811103..0c1b5b069 100644 --- a/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/fasta/CachingIndexedFastaSequenceFileUnitTest.java @@ -252,7 +252,7 @@ public class CachingIndexedFastaSequenceFileUnitTest extends BaseTest { Assert.assertEquals(changingNs, preservingNs + 4); } - @Test(enabled = true, expectedExceptions = {IllegalStateException.class}) + @Test(enabled = true, expectedExceptions = {UserException.class}) public void testFailOnBadBase() throws FileNotFoundException, InterruptedException { final String testFasta = privateTestDir + "problematicFASTA.fasta"; final CachingIndexedFastaSequenceFile fasta = new CachingIndexedFastaSequenceFile(new File(testFasta)); diff --git a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java index 20971643e..b01c53e77 100644 --- a/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/sam/ReadUtilsUnitTest.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.utils.sam; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.variant.utils.BaseUtils; +import org.broadinstitute.sting.utils.BaseUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; diff --git a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java index fb180c476..562130101 100644 --- a/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java +++ b/public/java/test/org/broadinstitute/variant/variantcontext/GenotypeLikelihoodsUnitTest.java @@ -31,7 +31,6 @@ package org.broadinstitute.variant.variantcontext; import org.broad.tribble.TribbleException; import org.broadinstitute.variant.VariantBaseTest; -import org.broadinstitute.variant.utils.BaseUtils; import org.broadinstitute.variant.utils.GeneralUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -154,9 +153,9 @@ public class GenotypeLikelihoodsUnitTest extends VariantBaseTest { public void testGetQualFromLikelihoodsMultiAllelic() { GenotypeLikelihoods gl = GenotypeLikelihoods.fromLog10Likelihoods(triAllelic); - Allele ref = Allele.create(BaseUtils.Base.A.base,true); - Allele alt1 = Allele.create(BaseUtils.Base.C.base); - Allele alt2 = Allele.create(BaseUtils.Base.T.base); + Allele ref = Allele.create((byte)'A',true); + Allele alt1 = Allele.create((byte)'C'); + Allele alt2 = Allele.create((byte)'T'); List allAlleles = Arrays.asList(ref,alt1,alt2); List gtAlleles = Arrays.asList(alt1,alt2); GenotypeBuilder gtBuilder = new GenotypeBuilder(); From 85dabd321f342e8893f72fb3f64892a4f977b777 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 13:26:07 -0500 Subject: [PATCH 06/11] Adding unit tests for hierarchicalBayesianQualityEstimate function --- .../recalibration/BaseRecalibration.java | 2 ++ .../recalibration/RecalDatumUnitTest.java | 33 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index f3d4a3096..bb62cd74d 100644 --- a/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/protected/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -46,6 +46,7 @@ package org.broadinstitute.sting.utils.recalibration; +import com.google.java.contract.Ensures; import net.sf.samtools.SAMTag; import net.sf.samtools.SAMUtils; import org.apache.log4j.Logger; @@ -188,6 +189,7 @@ public class BaseRecalibration { } } + @Ensures("result > 0.0") protected static double hierarchicalBayesianQualityEstimate( final double epsilon, final RecalDatum empiricalQualRG, final RecalDatum empiricalQualQS, final List empiricalQualCovs ) { final double globalDeltaQ = ( empiricalQualRG == null ? 0.0 : empiricalQualRG.getEmpiricalQuality(epsilon) - epsilon ); final double deltaQReported = ( empiricalQualQS == null ? 0.0 : empiricalQualQS.getEmpiricalQuality(globalDeltaQ + epsilon) - (globalDeltaQ + epsilon) ); diff --git a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 37f0fcc0c..da78932d1 100644 --- a/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/protected/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -50,6 +50,7 @@ package org.broadinstitute.sting.utils.recalibration; // the imports for unit testing. +import org.apache.commons.lang.ArrayUtils; import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; @@ -57,6 +58,7 @@ import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; +import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; @@ -281,32 +283,27 @@ public class RecalDatumUnitTest extends BaseTest { @Test public void basicHierarchicalBayesianQualityEstimateTest() { - { - double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); - double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)10000000, (long) (10000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { + double RG_Q = 45.0; + RecalDatum RG = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + double Q = 30.0; + RecalDatum QS = new RecalDatum( (long)100000000, (long) (100000000 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + // initial epsilon condition shouldn't matter when there are a lot of observations - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), Q, 1E-4 ); } - { + for( double epsilon = 15.0; epsilon <= 60.0; epsilon += 2.0 ) { double RG_Q = 45.0; - RecalDatum RG = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); + RecalDatum RG = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, RG_Q/10.0))), (byte)RG_Q); double Q = 30.0; - RecalDatum QS = new RecalDatum( (long)100, (long) (100 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); - RecalDatum COV = new RecalDatum( (long)150, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality + RecalDatum QS = new RecalDatum( (long)10, (long) (10 * 1.0 / (Math.pow(10.0, Q/10.0))), (byte)Q); + RecalDatum COV = new RecalDatum( (long)15, (long) 1, (byte)45.0); // no data here so Bayesian prior has a huge effect on the empirical quality // initial epsilon condition dominates when there is no data - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 45.0, RG, QS, Collections.singletonList(COV)), 45.0, 1E-4 ); - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 25.0, RG, QS, Collections.singletonList(COV)), 25.0, 1E-4 ); - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 5.0, RG, QS, Collections.singletonList(COV)), 5.0, 1E-4 ); - Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( 65.0, RG, QS, Collections.singletonList(COV)), 65.0, 1E-4 ); + Assert.assertEquals(BaseRecalibration.hierarchicalBayesianQualityEstimate( epsilon, RG, QS, Collections.singletonList(COV)), epsilon, 1E-4 ); } } From ff8ba03249ff81c523f02dfe753eb46ba7611a60 Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 13:30:18 -0500 Subject: [PATCH 07/11] Updating BQSR integration test md5s to reflect the updates to the hierarchicalBayesianQualityEstimate function --- .../sting/gatk/walkers/bqsr/BQSRIntegrationTest.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 254bfd92c..1f4875298 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -188,12 +188,12 @@ public class BQSRIntegrationTest extends WalkerTest { public Object[][] createPRTestData() { List tests = new ArrayList(); - tests.add(new Object[]{1, new PRTest(" -qq -1", "64622d37e234e57cf472068a00b4ec89")}); - tests.add(new Object[]{1, new PRTest(" -qq 6", "735270581d1ee836f3f99d68ecbd5f24")}); - tests.add(new Object[]{1, new PRTest(" -DIQ", "1e3b0a11f246a238ac4c06b084142715")}); + tests.add(new Object[]{1, new PRTest(" -qq -1", "b8d296fb78adc5cff7ce12073a69d985")}); + tests.add(new Object[]{1, new PRTest(" -qq 6", "2ee0cedf84c1a33a807438172bc36c11")}); + tests.add(new Object[]{1, new PRTest(" -DIQ", "46cf74900bf8b13438e6a195b3085c48")}); for ( final int nct : Arrays.asList(1, 2, 4) ) { - tests.add(new Object[]{nct, new PRTest("", "39ecde2c2ba4a7c109e65372618a419a")}); + tests.add(new Object[]{nct, new PRTest("", "88b88f006ebb1aade85089bfba5a9e8d")}); } return tests.toArray(new Object[][]{}); From 591df2be4473bbf5845a4c1548a4ed1f4c3bbb09 Mon Sep 17 00:00:00 2001 From: David Roazen Date: Wed, 30 Jan 2013 13:46:59 -0500 Subject: [PATCH 08/11] Move additional VariantContext utility methods back to the GATK Thanks to Eric for his feedback --- .../walkers/phasing/ReadBackedPhasing.java | 2 +- .../walkers/variantutils/CombineVariants.java | 4 +- .../variantutils/ConcordanceMetrics.java | 8 +- .../walkers/variantutils/VariantsToVCF.java | 3 +- .../variant/GATKVariantContextUtils.java | 119 ++++++++++++++++++ .../variantcontext/VariantContext.java | 4 + .../variantcontext/VariantContextUtils.java | 117 ----------------- 7 files changed, 131 insertions(+), 126 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java index 980894112..fe38461c5 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasing.java @@ -320,7 +320,7 @@ public class ReadBackedPhasing extends RodWalker subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { + Map attributes = new HashMap(keysToPreserve.size()); + for ( final String key : keysToPreserve ) { + if ( igc.hasAttribute(key) ) + attributes.put(key, igc.getAttribute(key)); + } + return attributes; + } + + /** + * @deprecated use variant context builder version instead + * @param vc the variant context + * @param keysToPreserve the keys to preserve + * @return a pruned version of the original variant context + */ + @Deprecated + public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { + return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); + } + + public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { + final VariantContext vc = builder.make(); + if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); + + // VC info + final Map attributes = subsetAttributes(vc.getCommonInfo(), keysToPreserve); + + // Genotypes + final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype g : vc.getGenotypes() ) { + final GenotypeBuilder gb = new GenotypeBuilder(g); + // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ + gb.noAD().noDP().noPL().noAttributes(); + genotypes.add(gb.make()); + } + + return builder.genotypes(genotypes).attributes(attributes); + } + + public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { + // if all alleles of vc1 are a contained in alleles of vc2, return true + if (!vc1.getReference().equals(vc2.getReference())) + return false; + + for (Allele a :vc1.getAlternateAlleles()) { + if (!vc2.getAlternateAlleles().contains(a)) + return false; + } + + return true; + } + + public static Map> separateVariantContextsByType(Collection VCs) { + HashMap> mappedVCs = new HashMap>(); + for ( VariantContext vc : VCs ) { + + // look at previous variant contexts of different type. If: + // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list + // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) + // c) neither: do nothing, just add vc to its own list + boolean addtoOwnList = true; + for (VariantContext.Type type : VariantContext.Type.values()) { + if (type.equals(vc.getType())) + continue; + + if (!mappedVCs.containsKey(type)) + continue; + + List vcList = mappedVCs.get(type); + for (int k=0; k < vcList.size(); k++) { + VariantContext otherVC = vcList.get(k); + if (allelesAreSubset(otherVC,vc)) { + // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list + vcList.remove(k); + // avoid having empty lists + if (vcList.size() == 0) + mappedVCs.remove(type); + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(otherVC); + break; + } + else if (allelesAreSubset(vc,otherVC)) { + // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own + mappedVCs.get(type).add(vc); + addtoOwnList = false; + break; + } + } + } + if (addtoOwnList) { + if ( !mappedVCs.containsKey(vc.getType()) ) + mappedVCs.put(vc.getType(), new ArrayList()); + mappedVCs.get(vc.getType()).add(vc); + } + } + + return mappedVCs; + } + + public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { + if ( allowedAttributes == null ) + return vc; + + GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); + for ( final Genotype genotype : vc.getGenotypes() ) { + Map attrs = new HashMap(); + for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { + if ( allowedAttributes.contains(attr.getKey()) ) + attrs.put(attr.getKey(), attr.getValue()); + } + newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); + } + + return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); + } + + private static class AlleleMapper { private VariantContext vc = null; private Map map = null; diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java index 05edee153..1fce89431 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContext.java @@ -625,6 +625,10 @@ public class VariantContext implements Feature { // to enable tribble integratio public double getAttributeAsDouble(String key, double defaultValue) { return commonInfo.getAttributeAsDouble(key, defaultValue); } public boolean getAttributeAsBoolean(String key, boolean defaultValue) { return commonInfo.getAttributeAsBoolean(key, defaultValue); } + public CommonInfo getCommonInfo() { + return commonInfo; + } + // --------------------------------------------------------------------------------------------------------- // // Working with alleles diff --git a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java index 11d84e318..a5b7b6c04 100644 --- a/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java +++ b/public/java/src/org/broadinstitute/variant/variantcontext/VariantContextUtils.java @@ -316,106 +316,6 @@ public class VariantContextUtils { return r; } - private final static Map subsetAttributes(final CommonInfo igc, final Collection keysToPreserve) { - Map attributes = new HashMap(keysToPreserve.size()); - for ( final String key : keysToPreserve ) { - if ( igc.hasAttribute(key) ) - attributes.put(key, igc.getAttribute(key)); - } - return attributes; - } - - /** - * @deprecated use variant context builder version instead - * @param vc the variant context - * @param keysToPreserve the keys to preserve - * @return a pruned version of the original variant context - */ - @Deprecated - public static VariantContext pruneVariantContext(final VariantContext vc, Collection keysToPreserve ) { - return pruneVariantContext(new VariantContextBuilder(vc), keysToPreserve).make(); - } - - public static VariantContextBuilder pruneVariantContext(final VariantContextBuilder builder, Collection keysToPreserve ) { - final VariantContext vc = builder.make(); - if ( keysToPreserve == null ) keysToPreserve = Collections.emptyList(); - - // VC info - final Map attributes = subsetAttributes(vc.commonInfo, keysToPreserve); - - // Genotypes - final GenotypesContext genotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype g : vc.getGenotypes() ) { - final GenotypeBuilder gb = new GenotypeBuilder(g); - // remove AD, DP, PL, and all extended attributes, keeping just GT and GQ - gb.noAD().noDP().noPL().noAttributes(); - genotypes.add(gb.make()); - } - - return builder.genotypes(genotypes).attributes(attributes); - } - - public static boolean allelesAreSubset(VariantContext vc1, VariantContext vc2) { - // if all alleles of vc1 are a contained in alleles of vc2, return true - if (!vc1.getReference().equals(vc2.getReference())) - return false; - - for (Allele a :vc1.getAlternateAlleles()) { - if (!vc2.getAlternateAlleles().contains(a)) - return false; - } - - return true; - } - - public static Map> separateVariantContextsByType(Collection VCs) { - HashMap> mappedVCs = new HashMap>(); - for ( VariantContext vc : VCs ) { - - // look at previous variant contexts of different type. If: - // a) otherVC has alleles which are subset of vc, remove otherVC from its list and add otherVC to vc's list - // b) vc has alleles which are subset of otherVC. Then, add vc to otherVC's type list (rather, do nothing since vc will be added automatically to its list) - // c) neither: do nothing, just add vc to its own list - boolean addtoOwnList = true; - for (VariantContext.Type type : VariantContext.Type.values()) { - if (type.equals(vc.getType())) - continue; - - if (!mappedVCs.containsKey(type)) - continue; - - List vcList = mappedVCs.get(type); - for (int k=0; k < vcList.size(); k++) { - VariantContext otherVC = vcList.get(k); - if (allelesAreSubset(otherVC,vc)) { - // otherVC has a type different than vc and its alleles are a subset of vc: remove otherVC from its list and add it to vc's type list - vcList.remove(k); - // avoid having empty lists - if (vcList.size() == 0) - mappedVCs.remove(type); - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(otherVC); - break; - } - else if (allelesAreSubset(vc,otherVC)) { - // vc has a type different than otherVC and its alleles are a subset of VC: add vc to otherVC's type list and don't add to its own - mappedVCs.get(type).add(vc); - addtoOwnList = false; - break; - } - } - } - if (addtoOwnList) { - if ( !mappedVCs.containsKey(vc.getType()) ) - mappedVCs.put(vc.getType(), new ArrayList()); - mappedVCs.get(vc.getType()).add(vc); - } - } - - return mappedVCs; - } - // TODO: remove that after testing // static private void verifyUniqueSampleNames(Collection unsortedVCs) { // Set names = new HashSet(); @@ -431,23 +331,6 @@ public class VariantContextUtils { // } - public static VariantContext purgeUnallowedGenotypeAttributes(VariantContext vc, Set allowedAttributes) { - if ( allowedAttributes == null ) - return vc; - - GenotypesContext newGenotypes = GenotypesContext.create(vc.getNSamples()); - for ( final Genotype genotype : vc.getGenotypes() ) { - Map attrs = new HashMap(); - for ( Map.Entry attr : genotype.getExtendedAttributes().entrySet() ) { - if ( allowedAttributes.contains(attr.getKey()) ) - attrs.put(attr.getKey(), attr.getValue()); - } - newGenotypes.add(new GenotypeBuilder(genotype).attributes(attrs).make()); - } - - return new VariantContextBuilder(vc).genotypes(newGenotypes).make(); - } - public static int getSize( VariantContext vc ) { return vc.getEnd() - vc.getStart() + 1; } From 5f4a063def1832283157494a8acc1cbedf43502d Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 16:14:07 -0500 Subject: [PATCH 09/11] Breaking up my massive commits into smaller pieces that I can successfully merge and digest. This one enables downsampling in the HaplotypeCaller (by lowering the default dcov to 20) and removes my long-standing, temporary region-based downsampling. --- .../haplotypecaller/HaplotypeCaller.java | 36 +++---------------- .../HaplotypeCallerIntegrationTest.java | 24 ++++++------- 2 files changed, 16 insertions(+), 44 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index a27eaac8e..6cfbc3830 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -56,6 +56,7 @@ import org.broadinstitute.sting.gatk.arguments.StandardCallerArgumentCollection; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.AlignmentContextUtils; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.downsampling.DownsampleType; import org.broadinstitute.sting.gatk.filters.BadMateFilter; import org.broadinstitute.sting.gatk.io.StingSAMFileWriter; import org.broadinstitute.sting.gatk.iterators.ReadTransformer; @@ -132,7 +133,7 @@ import java.util.*; @PartitionBy(PartitionType.LOCUS) @BAQMode(ApplicationTime = ReadTransformer.ApplicationTime.FORBIDDEN) @ActiveRegionTraversalParameters(extension=65, maxRegion=300) -//@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=5) +@Downsample(by= DownsampleType.BY_SAMPLE, toCoverage=20) public class HaplotypeCaller extends ActiveRegionWalker implements AnnotatorCompatible { /** @@ -180,9 +181,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem @Argument(fullName="minKmer", shortName="minKmer", doc="Minimum kmer length to use in the assembly graph", required = false) protected int minKmer = 11; - @Argument(fullName="downsampleRegion", shortName="dr", doc="coverage, per-sample, to downsample each active region to", required = false) - protected int DOWNSAMPLE_PER_SAMPLE_PER_REGION = 1000; - /** * If this flag is provided, the haplotype caller will include unmapped reads in the assembly and calling * when these reads occur in the region being analyzed. Typically, for paired end analyses, one pair of the @@ -463,7 +461,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem allelesToGenotype.removeAll( activeAllelesToGenotype ); } - if( !activeRegion.isActive()) { return 0; } // Not active so nothing to do! + if( !activeRegion.isActive() ) { return 0; } // Not active so nothing to do! if( activeRegion.size() == 0 && UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { return 0; } // No reads here so nothing to do! if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES && activeAllelesToGenotype.isEmpty() ) { return 0; } // No alleles found in this region so nothing to do! @@ -474,7 +472,6 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); - //int PRUNE_FACTOR = Math.max(MIN_PRUNE_FACTOR, determinePruneFactorFromCoverage( activeRegion )); final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! @@ -571,18 +568,13 @@ public class HaplotypeCaller extends ActiveRegionWalker implem finalizedReadList.addAll( FragmentUtils.mergeOverlappingPairedFragments(overlappingPair) ); } - Collections.shuffle(finalizedReadList, GenomeAnalysisEngine.getRandomGenerator()); - // Loop through the reads hard clipping the adaptor and low quality tails final ArrayList readsToUse = new ArrayList(finalizedReadList.size()); for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { final GATKSAMRecord clippedRead = ReadClipper.hardClipLowQualEnds( postAdapterRead, MIN_TAIL_QUALITY ); - // protect against INTERVALS with abnormally high coverage - // TODO BUGBUG: remove when positional downsampler is hooked up to ART/HC - if( activeRegion.readOverlapsRegion(clippedRead) && - clippedRead.getReadLength() > 0 && activeRegion.size() < samplesList.size() * DOWNSAMPLE_PER_SAMPLE_PER_REGION ) { + if( activeRegion.readOverlapsRegion(clippedRead) && clippedRead.getReadLength() > 0 ) { readsToUse.add(clippedRead); } } @@ -711,24 +703,4 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return new Cigar(readCigarElements); } - - /* - private int determinePruneFactorFromCoverage( final ActiveRegion activeRegion ) { - final ArrayList readLengthDistribution = new ArrayList(); - for( final GATKSAMRecord read : activeRegion.getReads() ) { - readLengthDistribution.add(read.getReadLength()); - } - final double meanReadLength = MathUtils.average(readLengthDistribution); - final double meanCoveragePerSample = (double) activeRegion.getReads().size() / ((double) activeRegion.getExtendedLoc().size() / meanReadLength) / (double) samplesList.size(); - int PRUNE_FACTOR = 0; - if( meanCoveragePerSample > 8.5 ) { - PRUNE_FACTOR = (int) Math.floor( Math.sqrt( meanCoveragePerSample - 5.0 ) ); - } else if( meanCoveragePerSample > 3.0 ) { - PRUNE_FACTOR = 1; - } - - if( DEBUG ) { System.out.println(String.format("Mean coverage per sample = %.1f --> prune factor = %d", meanCoveragePerSample, PRUNE_FACTOR)); } - return PRUNE_FACTOR; - } - */ } \ No newline at end of file diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index d446da830..ad682734c 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -68,12 +68,12 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "664a14590d0966e63d3aabff2d7bab0a"); + HCTest(CEUTRIO_BAM, "", "72ce6a5e46644dfd73aeffba9d6131ea"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "111f3dc086a3cea1be9bd1ad6e1d18ed"); + HCTest(NA12878_BAM, "", "f9d696391f1f337092d70e3abcd32bfb"); } @Test(enabled = false) @@ -84,7 +84,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "c70f753f7918a1c670ce4ed5c66de09e"); + "4e8beb2cdc3d77427f14acf37cea2bd0"); } private void HCTestComplexGGA(String bam, String args, String md5) { @@ -96,13 +96,13 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleGGAComplex() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:119673-119823 -L 20:121408-121538", - "b1d3070f0c49becf34101e480ab6c589"); + "75e1df0dcf3728fd2b6e4735c4cc88ce"); } @Test public void testHaplotypeCallerMultiSampleGGAMultiAllelic() { HCTestComplexGGA(NA12878_CHR20_BAM, "-L 20:133041-133161 -L 20:300207-300337", - "20eba2e54266f6aebf35b7b7b7e754e3"); + "1d244f2adbc72a0062eb673d56cbb5a8"); } private void HCTestComplexVariants(String bam, String args, String md5) { @@ -113,7 +113,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "f9805488d85e59e1ae002d0d32d7011d"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "a1bc844f62a9cb60dbb70d00ad36b85d"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -124,7 +124,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "4544a2916f46f58b32db8008776b91a3"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "23956e572f19ff26d25bbdfaa307675b"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -135,7 +135,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "f3984a91e7562494c2a7e41fd05a6734"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "1255f466aa2d288f015cd55d8fece1ac"); } // That problem bam came from a user on the forum and it spotted a problem where the ReadClipper @@ -146,14 +146,14 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3e9e025c539be6c5e0d0f2e5d8e86a62")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("103c91c4a78164949e166d3d27eb459b")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s -minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("34129e6c6310ef4eeeeb59b0e7ac0464")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("87fe31a4bbd68a9eb5d5910db5011c82")); executeTest("HCTestStructuralIndels: ", spec); } @@ -175,7 +175,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("5f4c07aaf1d2d34cccce43196a5fbd71")); + Arrays.asList("0fa19ec5cf737a3445544b59ecc995e9")); executeTest("HC calling on a ReducedRead BAM", spec); } @@ -183,7 +183,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void testReducedBamWithReadsNotFullySpanningDeletion() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "reduced.readNotFullySpanningDeletion.bam -o %s -L 1:167871297", 1, - Arrays.asList("6ead001b1f8e7cb433fd335f78fde5f0")); + Arrays.asList("5f4cbdcc9bffee6bba258dfac89492ed")); executeTest("test calling on a ReducedRead BAM where the reads do not fully span a deletion", spec); } } From b70733133260bc76cdd1cfcb6efdc89107f0f005 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 30 Jan 2013 16:41:23 -0500 Subject: [PATCH 10/11] Encrypt GATK AWS keys using the GATK private key, and decrypt as needed as a resource when uploading to AWS logs -- Has the overall effect that the GATK user AWS keys are no longer visible in the gatk source as plain text. This will stop AWS from emailing me (they crawl the web looking for keys) -- Added utility EncryptAWSKeys that takes as command line arguments the GATK user AWS access and secret keys, encrypts them with the GATK private key, and writes out the resulting file to resources in phonehome. -- GATKRunReport now decrypts as needed these keys using the GATK public key as resources in the GATK bundle -- Refactored the essential function of Resource (reading the resource) from IOUtils into the class itself. Now how to get the data in the resouce is straightforward -- Refactored md5 calculation code from a byte[] into Utils. Added unit tests -- Committing the encrypted AWS keys -- #resolves https://jira.broadinstitute.org/browse/GSA-730 --- build.xml | 3 + .../sting/gatk/phonehome/GATKRunReport.java | 59 ++++++++++++++++-- .../sting/gatk/phonehome/GATK_AWS_access.key | 2 + .../sting/gatk/phonehome/GATK_AWS_secret.key | Bin 0 -> 256 bytes .../org/broadinstitute/sting/utils/Utils.java | 28 +++++++++ .../sting/utils/io/IOUtils.java | 12 +--- .../sting/utils/io/Resource.java | 24 +++++++ .../test/org/broadinstitute/sting/MD5DB.java | 9 +-- .../gatk/phonehome/GATKRunReportUnitTest.java | 52 +++++++++++++++ .../sting/utils/UtilsUnitTest.java | 15 +++++ 10 files changed, 181 insertions(+), 23 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_access.key create mode 100644 public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_secret.key create mode 100644 public/java/test/org/broadinstitute/sting/gatk/phonehome/GATKRunReportUnitTest.java diff --git a/build.xml b/build.xml index e92e41c10..1e88bb400 100644 --- a/build.xml +++ b/build.xml @@ -708,6 +708,9 @@ + + + diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java index c2a064dbe..839ebcfa1 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATKRunReport.java @@ -31,8 +31,11 @@ import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.crypt.CryptUtils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; +import org.broadinstitute.sting.utils.io.IOUtils; +import org.broadinstitute.sting.utils.io.Resource; import org.broadinstitute.sting.utils.threading.ThreadEfficiencyMonitor; import org.jets3t.service.S3Service; import org.jets3t.service.S3ServiceException; @@ -48,6 +51,7 @@ import org.simpleframework.xml.stream.HyphenStyle; import java.io.*; import java.security.NoSuchAlgorithmException; +import java.security.PublicKey; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -309,6 +313,51 @@ public class GATKRunReport { } } + /** + * Decrypts encrypted AWS key from encryptedKeySource + * @param encryptedKeySource a file containing an encrypted AWS key + * @return a decrypted AWS key as a String + */ + public static String decryptAWSKey(final File encryptedKeySource) throws FileNotFoundException { + return decryptAWSKey(new FileInputStream(encryptedKeySource)); + } + + /** + * @see #decryptAWSKey(java.io.File) but with input from an inputstream + */ + private static String decryptAWSKey(final InputStream encryptedKeySource) { + final PublicKey key = CryptUtils.loadGATKDistributedPublicKey(); + final byte[] fromDisk = IOUtils.readStreamIntoByteArray(encryptedKeySource); + final byte[] decrypted = CryptUtils.decryptData(fromDisk, key); + return new String(decrypted); + } + + /** + * Get the decrypted AWS key sorted in the resource directories of name + * @param name the name of the file containing the needed AWS key + * @return a non-null GATK + */ + private static String getAWSKey(final String name) { + final Resource resource = new Resource(name, GATKRunReport.class); + return decryptAWSKey(resource.getResourceContentsAsStream()); + } + + /** + * Get the AWS access key for the GATK user + * @return a non-null AWS access key for the GATK user + */ + protected static String getAWSAccessKey() { + return getAWSKey("GATK_AWS_access.key"); + } + + /** + * Get the AWS secret key for the GATK user + * @return a non-null AWS secret key for the GATK user + */ + protected static String getAWSSecretKey() { + return getAWSKey("GATK_AWS_secret.key"); + } + private class S3PutRunnable implements Runnable { public AtomicBoolean isSuccess; @@ -331,17 +380,17 @@ public class GATKRunReport { // are stored in an AWSCredentials object: // IAM GATK user credentials -- only right is to PutObject into GATK_Run_Report bucket - String awsAccessKey = "AKIAJXU7VIHBPDW4TDSQ"; // GATK AWS user - String awsSecretKey = "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"; // GATK AWS user - AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); + final String awsAccessKey = getAWSAccessKey(); // GATK AWS user + final String awsSecretKey = getAWSSecretKey(); // GATK AWS user + final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey); // To communicate with S3, create a class that implements an S3Service. We will use the REST/HTTP // implementation based on HttpClient, as this is the most robust implementation provided with JetS3t. - S3Service s3Service = new RestS3Service(awsCredentials); + final S3Service s3Service = new RestS3Service(awsCredentials); // Create an S3Object based on a file, with Content-Length set automatically and // Content-Type set based on the file's extension (using the Mimetypes utility class) - S3Object fileObject = new S3Object(key, report); + final S3Object fileObject = new S3Object(key, report); //logger.info("Created S3Object" + fileObject); //logger.info("Uploading " + localFile + " to AWS bucket"); s3Object = s3Service.putObject(REPORT_BUCKET_NAME, fileObject); diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_access.key b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_access.key new file mode 100644 index 000000000..45242c3cd --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_access.key @@ -0,0 +1,2 @@ +BwX~[uGe,툉)s/sg1LQaG%$R݊{xqPz׊J}\{(BK&ܶ,`@`oX +%/`m֑Ȥ3h3rQ v1aW?uHsaz޿M˦?Uzhk#+xĄMk X)MıՕ ip-?jH@d+HR|bF \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_secret.key b/public/java/src/org/broadinstitute/sting/gatk/phonehome/GATK_AWS_secret.key new file mode 100644 index 0000000000000000000000000000000000000000..b86a25036e84ae8a1373776e52f1ef04a135bfd0 GIT binary patch literal 256 zcmV+b0ssClQ_lh#EL zNy%zD4$cZr3OHzad*T{?JVK|@Tkm)?rp1pN33JU-fi{b*XQH?g&z%d%(ha|KT`EjN zBSe){t0~(z%*aiOc6s(*c)DT0;m|PNz6Iq-NLK3b$Te4ViCd4fXKs%s(Pj` clazz = resource.getRelativeClass(); - InputStream inputStream = null; + InputStream inputStream = resource.getResourceContentsAsStream(); OutputStream outputStream = null; try { - if (clazz == null) { - inputStream = ClassLoader.getSystemResourceAsStream(path); - if (inputStream == null) - throw new IllegalArgumentException("Resource not found: " + path); - } else { - inputStream = clazz.getResourceAsStream(path); - if (inputStream == null) - throw new IllegalArgumentException("Resource not found relative to " + clazz + ": " + path); - } outputStream = FileUtils.openOutputStream(file); org.apache.commons.io.IOUtils.copy(inputStream, outputStream); } catch (IOException e) { diff --git a/public/java/src/org/broadinstitute/sting/utils/io/Resource.java b/public/java/src/org/broadinstitute/sting/utils/io/Resource.java index 1f181a826..85ca5ce1c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/io/Resource.java +++ b/public/java/src/org/broadinstitute/sting/utils/io/Resource.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.utils.io; import java.io.File; +import java.io.InputStream; /** * Stores a resource by path and a relative class. @@ -64,4 +65,27 @@ public class Resource { File.separator, path); } + + /** + * Get the contents of this resource as an InputStream + * @throws IllegalArgumentException if resource cannot be read + * @return an input stream that will read the contents of this resource + */ + public InputStream getResourceContentsAsStream() { + final Class clazz = getRelativeClass(); + + final InputStream inputStream; + if (clazz == null) { + inputStream = ClassLoader.getSystemResourceAsStream(path); + if (inputStream == null) + throw new IllegalArgumentException("Resource not found: " + path); + } else { + inputStream = clazz.getResourceAsStream(path); + if (inputStream == null) + throw new IllegalArgumentException("Resource not found relative to " + clazz + ": " + path); + + } + + return inputStream; + } } diff --git a/public/java/test/org/broadinstitute/sting/MD5DB.java b/public/java/test/org/broadinstitute/sting/MD5DB.java index aed98b78a..2b0d52a11 100644 --- a/public/java/test/org/broadinstitute/sting/MD5DB.java +++ b/public/java/test/org/broadinstitute/sting/MD5DB.java @@ -28,11 +28,10 @@ package org.broadinstitute.sting; import org.apache.commons.io.FileUtils; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.walkers.diffengine.DiffEngine; +import org.broadinstitute.sting.utils.Utils; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import java.io.*; -import java.math.BigInteger; -import java.security.MessageDigest; import java.util.Arrays; /** @@ -252,11 +251,7 @@ public class MD5DB { */ public String testFileMD5(final String name, final File resultsFile, final String expectedMD5, final boolean parameterize) { try { - byte[] bytesOfMessage = getBytesFromFile(resultsFile); - byte[] thedigest = MessageDigest.getInstance("MD5").digest(bytesOfMessage); - BigInteger bigInt = new BigInteger(1, thedigest); - String filemd5sum = bigInt.toString(16); - while (filemd5sum.length() < 32) filemd5sum = "0" + filemd5sum; // pad to length 32 + final String filemd5sum = Utils.calcMD5(getBytesFromFile(resultsFile)); // // copy md5 to integrationtests diff --git a/public/java/test/org/broadinstitute/sting/gatk/phonehome/GATKRunReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/phonehome/GATKRunReportUnitTest.java new file mode 100644 index 000000000..03f19968c --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/phonehome/GATKRunReportUnitTest.java @@ -0,0 +1,52 @@ +/* +* Copyright (c) 2012 The Broad Institute +* +* Permission is hereby granted, free of charge, to any person +* obtaining a copy of this software and associated documentation +* files (the "Software"), to deal in the Software without +* restriction, including without limitation the rights to use, +* copy, modify, merge, publish, distribute, sublicense, and/or sell +* copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following +* conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +* THE USE OR OTHER DEALINGS IN THE SOFTWARE. +*/ + +package org.broadinstitute.sting.gatk.phonehome; + +import junit.framework.Assert; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.Utils; +import org.testng.annotations.Test; + +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; + +public class GATKRunReportUnitTest extends BaseTest { + @Test + public void testAccessKey() throws Exception { + testAWSKey(GATKRunReport.getAWSAccessKey(), "c0f0afa1ff5ba41d9bf216cfcdbf26bf"); + } + + @Test + public void testSecretKey() throws Exception { + testAWSKey(GATKRunReport.getAWSSecretKey(), "db2f13b3a7c98ad24e28783733ec4a62"); + } + + private void testAWSKey(final String accessKey, final String expectedMD5) throws Exception { + Assert.assertNotNull(accessKey, "AccessKey should not be null"); + final String actualmd5 = Utils.calcMD5(accessKey); + Assert.assertEquals(actualmd5, expectedMD5); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java index fc10f1102..5d6ecd0f9 100644 --- a/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/UtilsUnitTest.java @@ -25,10 +25,13 @@ package org.broadinstitute.sting.utils; +import org.apache.commons.io.FileUtils; +import org.broadinstitute.sting.utils.io.IOUtils; import org.testng.Assert; import org.broadinstitute.sting.BaseTest; import org.testng.annotations.Test; +import java.io.File; import java.util.LinkedHashMap; import java.util.Map; @@ -135,4 +138,16 @@ public class UtilsUnitTest extends BaseTest { actual = Utils.escapeExpressions(" one two 'three four' "); Assert.assertEquals(actual, expected); } + + @Test + public void testCalcMD5() throws Exception { + final File source = new File(publicTestDir + "exampleFASTA.fasta"); + final String sourceMD5 = "36880691cf9e4178216f7b52e8d85fbe"; + + final byte[] sourceBytes = IOUtils.readFileIntoByteArray(source); + Assert.assertEquals(Utils.calcMD5(sourceBytes), sourceMD5); + + final String sourceString = FileUtils.readFileToString(source); + Assert.assertEquals(Utils.calcMD5(sourceString), sourceMD5); + } } From bb29bd7df75b258cc122cbff0968bb5b8da46c3a Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Wed, 30 Jan 2013 17:09:27 -0500 Subject: [PATCH 11/11] Use base List and Map types in the HaplotypeCaller when possible. --- .../haplotypecaller/GenotypingEngine.java | 44 +++++------ .../haplotypecaller/HaplotypeCaller.java | 26 +++---- .../LikelihoodCalculationEngine.java | 73 ++----------------- .../haplotypecaller/LocalAssemblyEngine.java | 4 +- .../SimpleDeBruijnAssembler.java | 18 ++--- .../broadinstitute/sting/utils/Haplotype.java | 6 +- .../utils/activeregion/ActiveRegion.java | 4 +- .../sting/utils/clipping/ReadClipper.java | 4 +- 8 files changed, 60 insertions(+), 119 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java index d254f5b8b..8b789791d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/GenotypingEngine.java @@ -84,7 +84,7 @@ public class GenotypingEngine { final List haplotypes, final List samples, final Map haplotypeReadMap, - final Map> perSampleFilteredReadList, + final Map> perSampleFilteredReadList, final byte[] ref, final GenomeLoc refLoc, final GenomeLoc activeRegionWindow, @@ -124,12 +124,12 @@ public class GenotypingEngine { // Walk along each position in the key set and create each event to be outputted for( final int loc : startPosKeySet ) { if( loc >= activeRegionWindow.getStart() && loc <= activeRegionWindow.getStop() ) { // genotyping an event inside this active region - final ArrayList eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view - final ArrayList priorityList = new ArrayList(); // used to merge overlapping events into common reference view + final List eventsAtThisLoc = new ArrayList(); // the overlapping events to merge into a common reference view + final List priorityList = new ArrayList(); // used to merge overlapping events into common reference view if( !in_GGA_mode ) { for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); + final Map eventMap = h.getEventMap(); final VariantContext vc = eventMap.get(loc); if( vc != null && !containsVCWithMatchingAlleles(eventsAtThisLoc, vc) ) { eventsAtThisLoc.add(vc); @@ -142,7 +142,7 @@ public class GenotypingEngine { if( compVC.getStart() == loc ) { int alleleCount = 0; for( final Allele compAltAllele : compVC.getAlternateAlleles() ) { - ArrayList alleleSet = new ArrayList(2); + List alleleSet = new ArrayList(2); alleleSet.add(compVC.getReference()); alleleSet.add(compAltAllele); final String vcSourceName = "Comp" + compCount + "Allele" + alleleCount; @@ -180,7 +180,7 @@ public class GenotypingEngine { if( eventsAtThisLoc.size() != mergedVC.getAlternateAlleles().size() ) { throw new ReviewedStingException("Record size mismatch! Something went wrong in the merging of alleles."); } - final HashMap mergeMap = new HashMap(); + final Map mergeMap = new HashMap(); mergeMap.put(null, mergedVC.getReference()); // the reference event (null) --> the reference allele for(int iii = 0; iii < mergedVC.getAlternateAlleles().size(); iii++) { mergeMap.put(eventsAtThisLoc.get(iii), mergedVC.getAlternateAllele(iii)); // BUGBUG: This is assuming that the order of alleles is the same as the priority list given to simpleMerge function @@ -232,7 +232,7 @@ public class GenotypingEngine { return genotypes; } - private void validatePriorityList( final ArrayList priorityList, final ArrayList eventsAtThisLoc ) { + private void validatePriorityList( final List priorityList, final List eventsAtThisLoc ) { for( final VariantContext vc : eventsAtThisLoc ) { if( !priorityList.contains(vc.getSource()) ) { throw new ReviewedStingException("Event found on haplotype that wasn't added to priority list. Something went wrong in the merging of alleles."); @@ -251,7 +251,7 @@ public class GenotypingEngine { private static Map filterToOnlyOverlappingReads( final GenomeLocParser parser, final Map perSampleReadMap, - final Map> perSampleFilteredReadList, + final Map> perSampleFilteredReadList, final VariantContext call ) { final Map returnMap = new HashMap(); @@ -284,7 +284,7 @@ public class GenotypingEngine { } protected static void cleanUpSymbolicUnassembledEvents( final List haplotypes ) { - final ArrayList haplotypesToRemove = new ArrayList(); + final List haplotypesToRemove = new ArrayList(); for( final Haplotype h : haplotypes ) { for( final VariantContext vc : h.getEventMap().values() ) { if( vc.isSymbolic() ) { @@ -407,7 +407,7 @@ public class GenotypingEngine { // remove the old event from the eventMap on every haplotype and the start pos key set, replace with merged event for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); + final Map eventMap = h.getEventMap(); if( eventMap.containsKey(thisStart) && eventMap.containsKey(nextStart) ) { eventMap.remove(thisStart); eventMap.remove(nextStart); @@ -418,7 +418,7 @@ public class GenotypingEngine { boolean containsStart = false; boolean containsNext = false; for( final Haplotype h : haplotypes ) { - final HashMap eventMap = h.getEventMap(); + final Map eventMap = h.getEventMap(); if( eventMap.containsKey(thisStart) ) { containsStart = true; } if( eventMap.containsKey(nextStart) ) { containsNext = true; } } @@ -457,7 +457,7 @@ public class GenotypingEngine { if( refBases.length == altBases.length ) { // insertion + deletion of same length creates an MNP --> trim common prefix bases off the beginning of the allele while( iii < refBases.length && refBases[iii] == altBases[iii] ) { iii++; } } - final ArrayList mergedAlleles = new ArrayList(); + final List mergedAlleles = new ArrayList(); mergedAlleles.add( Allele.create( ArrayUtils.subarray(refBases, iii, refBases.length), true ) ); mergedAlleles.add( Allele.create( ArrayUtils.subarray(altBases, iii, altBases.length), false ) ); return new VariantContextBuilder("merged", thisVC.getChr(), thisVC.getStart() + iii, nextVC.getEnd(), mergedAlleles).make(); @@ -492,10 +492,10 @@ public class GenotypingEngine { eventMapper.put(new Event(vc), new ArrayList()); } - final ArrayList undeterminedHaplotypes = new ArrayList(haplotypes.size()); + final List undeterminedHaplotypes = new ArrayList(haplotypes.size()); for( final Haplotype h : haplotypes ) { if( h.isArtificialHaplotype() && loc == h.getArtificialAllelePosition() ) { - final ArrayList alleles = new ArrayList(2); + final List alleles = new ArrayList(2); alleles.add(h.getArtificialRefAllele()); alleles.add(h.getArtificialAltAllele()); final Event artificialVC = new Event( (new VariantContextBuilder()).source("artificialHaplotype") @@ -572,13 +572,13 @@ public class GenotypingEngine { } @Ensures({"result.size() == haplotypeAllelesForSample.size()"}) - protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final ArrayList> alleleMapper, final ArrayList haplotypes ) { + protected static List findEventAllelesInSample( final List eventAlleles, final List haplotypeAlleles, final List haplotypeAllelesForSample, final List> alleleMapper, final List haplotypes ) { if( haplotypeAllelesForSample.contains(Allele.NO_CALL) ) { return noCall; } - final ArrayList eventAllelesForSample = new ArrayList(); + final List eventAllelesForSample = new ArrayList(); for( final Allele a : haplotypeAllelesForSample ) { final Haplotype haplotype = haplotypes.get(haplotypeAlleles.indexOf(a)); for( int iii = 0; iii < alleleMapper.size(); iii++ ) { - final ArrayList mappedHaplotypes = alleleMapper.get(iii); + final List mappedHaplotypes = alleleMapper.get(iii); if( mappedHaplotypes.contains(haplotype) ) { eventAllelesForSample.add(eventAlleles.get(iii)); break; @@ -597,8 +597,8 @@ public class GenotypingEngine { return false; } - protected static HashMap generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { - final HashMap vcs = new HashMap(); + protected static Map generateVCsFromAlignment( final Haplotype haplotype, final int alignmentStartHapwrtRef, final Cigar cigar, final byte[] ref, final byte[] alignment, final GenomeLoc refLoc, final String sourceNameToAdd ) { + final Map vcs = new HashMap(); int refPos = alignmentStartHapwrtRef; if( refPos < 0 ) { return null; } // Protection against SW failures @@ -609,7 +609,7 @@ public class GenotypingEngine { switch( ce.getOperator() ) { case I: { - final ArrayList insertionAlleles = new ArrayList(); + final List insertionAlleles = new ArrayList(); final int insertionStart = refLoc.getStart() + refPos - 1; final byte refByte = ref[refPos-1]; if( BaseUtils.isRegularBase(refByte) ) { @@ -639,7 +639,7 @@ public class GenotypingEngine { case D: { final byte[] deletionBases = Arrays.copyOfRange( ref, refPos - 1, refPos + elementLength ); // add padding base - final ArrayList deletionAlleles = new ArrayList(); + final List deletionAlleles = new ArrayList(); final int deletionStart = refLoc.getStart() + refPos - 1; // BUGBUG: how often does this symbolic deletion allele case happen? //if( haplotype != null && ( (haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 >= deletionStart && haplotype.leftBreakPoint + alignmentStartHapwrtRef + refLoc.getStart() + elementLength - 1 < deletionStart + elementLength) @@ -667,7 +667,7 @@ public class GenotypingEngine { final byte altByte = alignment[alignmentPos]; if( refByte != altByte ) { // SNP! if( BaseUtils.isRegularBase(refByte) && BaseUtils.isRegularBase(altByte) ) { - final ArrayList snpAlleles = new ArrayList(); + final List snpAlleles = new ArrayList(); snpAlleles.add( Allele.create( refByte, true ) ); snpAlleles.add( Allele.create( altByte, false ) ); vcs.put(refLoc.getStart() + refPos, new VariantContextBuilder(sourceNameToAdd, refLoc.getContig(), refLoc.getStart() + refPos, refLoc.getStart() + refPos, snpAlleles).make()); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java index 6cfbc3830..027c62e68 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCaller.java @@ -274,10 +274,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // bases with quality less than or equal to this value are trimmed off the tails of the reads private static final byte MIN_TAIL_QUALITY = 20; - private ArrayList samplesList = new ArrayList(); + private List samplesList = new ArrayList(); private final static double LOG_ONE_HALF = -Math.log10(2.0); private final static double LOG_ONE_THIRD = -Math.log10(3.0); - private final ArrayList allelesToGenotype = new ArrayList(); + private final List allelesToGenotype = new ArrayList(); private final static Allele FAKE_REF_ALLELE = Allele.create("N", true); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file private final static Allele FAKE_ALT_ALLELE = Allele.create("", false); // used in isActive function to call into UG Engine. Should never appear anywhere in a VCF file @@ -429,7 +429,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem genotypes.add( new GenotypeBuilder(sample.getKey()).alleles(noCall).PL(genotypeLikelihoods).make() ); } - final ArrayList alleles = new ArrayList(); + final List alleles = new ArrayList(); alleles.add( FAKE_REF_ALLELE ); alleles.add( FAKE_ALT_ALLELE ); final VariantCallContext vcOut = UG_engine_simple_genotyper.calculateGenotypes(new VariantContextBuilder("HCisActive!", context.getContig(), context.getLocation().getStart(), context.getLocation().getStop(), alleles).genotypes(genotypes).make(), GenotypeLikelihoodsCalculationModel.Model.INDEL); @@ -450,7 +450,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // we're benchmarking ART and/or the active region determination code in the HC, just leave without doing any work return 1; - final ArrayList activeAllelesToGenotype = new ArrayList(); + final List activeAllelesToGenotype = new ArrayList(); if( UG_engine.getUAC().GenotypingMode == GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ) { for( final VariantContext vc : allelesToGenotype ) { @@ -472,7 +472,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem final Haplotype referenceHaplotype = new Haplotype(activeRegion.getActiveRegionReference(referenceReader), true); // Create the reference haplotype which is the bases from the reference that make up the active region final byte[] fullReferenceWithPadding = activeRegion.getFullReference(referenceReader, REFERENCE_PADDING); - final ArrayList haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); + final List haplotypes = assemblyEngine.runLocalAssembly( activeRegion, referenceHaplotype, fullReferenceWithPadding, fullSpanBeforeClipping, MIN_PRUNE_FACTOR, activeAllelesToGenotype ); if( haplotypes.size() == 1 ) { return 1; } // only the reference haplotype remains so nothing else to do! activeRegion.hardClipToActiveRegion(); // only evaluate the parts of reads that are overlapping the active region @@ -484,10 +484,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem // evaluate each sample's reads against all haplotypes final Map stratifiedReadMap = likelihoodCalculationEngine.computeReadLikelihoods( haplotypes, splitReadsBySample( activeRegion.getReads() ) ); - final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); + final Map> perSampleFilteredReadList = splitReadsBySample( filteredReads ); // subset down to only the best haplotypes to be genotyped in all samples ( in GGA mode use all discovered haplotypes ) - final ArrayList bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? + final List bestHaplotypes = ( UG_engine.getUAC().GenotypingMode != GenotypeLikelihoodsCalculationModel.GENOTYPING_MODE.GENOTYPE_GIVEN_ALLELES ? likelihoodCalculationEngine.selectBestHaplotypes( haplotypes, stratifiedReadMap, maxNumHaplotypesInPopulation ) : haplotypes ); for( final VariantContext call : genotypingEngine.assignGenotypeLikelihoods( UG_engine, @@ -558,7 +558,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem private void finalizeActiveRegion( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { if( DEBUG ) { System.out.println("\nAssembling " + activeRegion.getLocation() + " with " + activeRegion.size() + " reads: (with overlap region = " + activeRegion.getExtendedLoc() + ")"); } - final ArrayList finalizedReadList = new ArrayList(); + final List finalizedReadList = new ArrayList(); final FragmentCollection fragmentCollection = FragmentUtils.create( activeRegion.getReads() ); activeRegion.clearReads(); @@ -569,7 +569,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } // Loop through the reads hard clipping the adaptor and low quality tails - final ArrayList readsToUse = new ArrayList(finalizedReadList.size()); + final List readsToUse = new ArrayList(finalizedReadList.size()); for( final GATKSAMRecord myRead : finalizedReadList ) { final GATKSAMRecord postAdapterRead = ( myRead.getReadUnmappedFlag() ? myRead : ReadClipper.hardClipAdaptorSequence( myRead ) ); if( postAdapterRead != null && !postAdapterRead.isEmpty() && postAdapterRead.getCigar().getReadLength() > 0 ) { @@ -583,7 +583,7 @@ public class HaplotypeCaller extends ActiveRegionWalker implem } private List filterNonPassingReads( final org.broadinstitute.sting.utils.activeregion.ActiveRegion activeRegion ) { - final ArrayList readsToRemove = new ArrayList(); + final List readsToRemove = new ArrayList(); for( final GATKSAMRecord rec : activeRegion.getReads() ) { if( rec.getReadLength() < 24 || rec.getMappingQuality() < 20 || BadMateFilter.hasBadMate(rec) || (keepRG != null && !rec.getReadGroup().getId().equals(keepRG)) ) { readsToRemove.add(rec); @@ -599,10 +599,10 @@ public class HaplotypeCaller extends ActiveRegionWalker implem return getToolkit().getGenomeLocParser().createGenomeLoc(activeRegion.getReadSpanLoc().getContig(), padLeft, padRight); } - private HashMap> splitReadsBySample( final List reads ) { - final HashMap> returnMap = new HashMap>(); + private Map> splitReadsBySample( final List reads ) { + final Map> returnMap = new HashMap>(); for( final String sample : samplesList) { - ArrayList readList = returnMap.get( sample ); + List readList = returnMap.get( sample ); if( readList == null ) { readList = new ArrayList(); returnMap.put(sample, readList); diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java index 57e071189..655b3e529 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LikelihoodCalculationEngine.java @@ -91,11 +91,11 @@ public class LikelihoodCalculationEngine { DEBUG = debug; } - public Map computeReadLikelihoods( final ArrayList haplotypes, final HashMap> perSampleReadList ) { + public Map computeReadLikelihoods( final List haplotypes, final Map> perSampleReadList ) { final Map stratifiedReadMap = new HashMap(); int X_METRIC_LENGTH = 0; - for( final Map.Entry> sample : perSampleReadList.entrySet() ) { + for( final Map.Entry> sample : perSampleReadList.entrySet() ) { for( final GATKSAMRecord read : sample.getValue() ) { final int readLength = read.getReadLength(); if( readLength > X_METRIC_LENGTH ) { X_METRIC_LENGTH = readLength; } @@ -115,7 +115,7 @@ public class LikelihoodCalculationEngine { pairHMM.initialize(X_METRIC_LENGTH, Y_METRIC_LENGTH); // for each sample's reads - for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { + for( final Map.Entry> sampleEntry : perSampleReadList.entrySet() ) { //if( DEBUG ) { System.out.println("Evaluating sample " + sample + " with " + perSampleReadList.get( sample ).size() + " passing reads"); } // evaluate the likelihood of the reads given those haplotypes stratifiedReadMap.put(sampleEntry.getKey(), computeReadLikelihoods(haplotypes, sampleEntry.getValue())); @@ -123,7 +123,7 @@ public class LikelihoodCalculationEngine { return stratifiedReadMap; } - private PerReadAlleleLikelihoodMap computeReadLikelihoods( final ArrayList haplotypes, final ArrayList reads) { + private PerReadAlleleLikelihoodMap computeReadLikelihoods( final List haplotypes, final List reads) { // first, a little set up to get copies of the Haplotypes that are Alleles (more efficient than creating them each time) final int numHaplotypes = haplotypes.size(); final Map alleleVersions = new HashMap(numHaplotypes); @@ -235,72 +235,13 @@ public class LikelihoodCalculationEngine { return likelihoodMatrix; } - /* @Requires({"haplotypes.size() > 0"}) @Ensures({"result.size() <= haplotypes.size()"}) - public ArrayList selectBestHaplotypes( final ArrayList haplotypes ) { - - // BUGBUG: This function needs a lot of work. Need to use 4-gamete test or Tajima's D to decide to break up events into separate pieces for genotyping - - final int numHaplotypes = haplotypes.size(); - final Set sampleKeySet = haplotypes.get(0).getSampleKeySet(); // BUGBUG: assume all haplotypes saw the same samples - final ArrayList bestHaplotypesIndexList = new ArrayList(); - bestHaplotypesIndexList.add(0); // always start with the reference haplotype - final double[][][] haplotypeLikelihoodMatrix = new double[sampleKeySet.size()][numHaplotypes][numHaplotypes]; - - int sampleCount = 0; - for( final String sample : sampleKeySet ) { - haplotypeLikelihoodMatrix[sampleCount++] = computeDiploidHaplotypeLikelihoods( haplotypes, sample ); - } - - int hap1 = 0; - int hap2 = 0; - int chosenSample = 0; - //double bestElement = Double.NEGATIVE_INFINITY; - final int maxChosenHaplotypes = Math.min( 15, sampleKeySet.size() * 2 + 1 ); - while( bestHaplotypesIndexList.size() < maxChosenHaplotypes ) { - double maxElement = Double.NEGATIVE_INFINITY; - for( int kkk = 0; kkk < sampleCount; kkk++ ) { - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - if( haplotypeLikelihoodMatrix[kkk][iii][jjj] > maxElement ) { - maxElement = haplotypeLikelihoodMatrix[kkk][iii][jjj]; - hap1 = iii; - hap2 = jjj; - chosenSample = kkk; - } - } - } - } - if( maxElement == Double.NEGATIVE_INFINITY ) { break; } - - if( !bestHaplotypesIndexList.contains(hap1) ) { bestHaplotypesIndexList.add(hap1); } - if( !bestHaplotypesIndexList.contains(hap2) ) { bestHaplotypesIndexList.add(hap2); } - - for( int iii = 0; iii < numHaplotypes; iii++ ) { - for( int jjj = 0; jjj <= iii; jjj++ ) { - haplotypeLikelihoodMatrix[chosenSample][iii][jjj] = Double.NEGATIVE_INFINITY; - } - } - } - - if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } - - final ArrayList bestHaplotypes = new ArrayList(); - for( final int hIndex : bestHaplotypesIndexList ) { - bestHaplotypes.add( haplotypes.get(hIndex) ); - } - return bestHaplotypes; - } - */ - - @Requires({"haplotypes.size() > 0"}) - @Ensures({"result.size() <= haplotypes.size()"}) - public ArrayList selectBestHaplotypes( final ArrayList haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation ) { + public List selectBestHaplotypes( final List haplotypes, final Map stratifiedReadMap, final int maxNumHaplotypesInPopulation ) { final int numHaplotypes = haplotypes.size(); final Set sampleKeySet = stratifiedReadMap.keySet(); - final ArrayList bestHaplotypesIndexList = new ArrayList(); + final List bestHaplotypesIndexList = new ArrayList(); bestHaplotypesIndexList.add( findReferenceIndex(haplotypes) ); // always start with the reference haplotype final List haplotypesAsAlleles = new ArrayList(); for( final Haplotype h : haplotypes ) { haplotypesAsAlleles.add(Allele.create(h.getBases())); } @@ -332,7 +273,7 @@ public class LikelihoodCalculationEngine { if( DEBUG ) { System.out.println("Chose " + (bestHaplotypesIndexList.size() - 1) + " alternate haplotypes to genotype in all samples."); } - final ArrayList bestHaplotypes = new ArrayList(); + final List bestHaplotypes = new ArrayList(); for( final int hIndex : bestHaplotypesIndexList ) { bestHaplotypes.add( haplotypes.get(hIndex) ); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java index b0e340dc2..3efa342b1 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/LocalAssemblyEngine.java @@ -51,7 +51,7 @@ import org.broadinstitute.sting.utils.Haplotype; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.variant.variantcontext.VariantContext; -import java.util.ArrayList; +import java.util.List; /** * Created by IntelliJ IDEA. @@ -67,5 +67,5 @@ public abstract class LocalAssemblyEngine { protected LocalAssemblyEngine() { } - public abstract ArrayList runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, ArrayList activeAllelesToGenotype); + public abstract List runLocalAssembly(ActiveRegion activeRegion, Haplotype refHaplotype, byte[] fullReferenceWithPadding, GenomeLoc refLoc, int PRUNE_FACTOR, List activeAllelesToGenotype); } diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java index e16994fa4..a9768557d 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/haplotypecaller/SimpleDeBruijnAssembler.java @@ -84,7 +84,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { private final boolean DEBUG; private final PrintStream GRAPH_WRITER; - private final ArrayList> graphs = new ArrayList>(); + private final List> graphs = new ArrayList>(); private final int MIN_KMER; private int PRUNE_FACTOR = 2; @@ -96,7 +96,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { MIN_KMER = minKmer; } - public ArrayList runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final ArrayList activeAllelesToGenotype ) { + public List runLocalAssembly( final ActiveRegion activeRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final int PRUNE_FACTOR, final List activeAllelesToGenotype ) { this.PRUNE_FACTOR = PRUNE_FACTOR; // create the graphs @@ -168,7 +168,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } protected static void pruneGraph( final DefaultDirectedGraph graph, final int pruneFactor ) { - final ArrayList edgesToRemove = new ArrayList(); + final List edgesToRemove = new ArrayList(); for( final DeBruijnEdge e : graph.edgeSet() ) { if( e.getMultiplicity() <= pruneFactor && !e.getIsRef() ) { // remove non-reference edges with weight less than or equal to the pruning factor edgesToRemove.add(e); @@ -177,7 +177,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { graph.removeAllEdges(edgesToRemove); // Run through the graph and clean up singular orphaned nodes - final ArrayList verticesToRemove = new ArrayList(); + final List verticesToRemove = new ArrayList(); for( final DeBruijnVertex v : graph.vertexSet() ) { if( graph.inDegreeOf(v) == 0 && graph.outDegreeOf(v) == 0 ) { verticesToRemove.add(v); @@ -187,7 +187,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } protected static void eliminateNonRefPaths( final DefaultDirectedGraph graph ) { - final ArrayList verticesToRemove = new ArrayList(); + final List verticesToRemove = new ArrayList(); boolean done = false; while( !done ) { done = true; @@ -313,8 +313,8 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { } @Ensures({"result.contains(refHaplotype)"}) - private ArrayList findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final ArrayList activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { - final ArrayList returnHaplotypes = new ArrayList(); + private List findBestPaths( final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final GenomeLoc refLoc, final List activeAllelesToGenotype, final GenomeLoc activeRegionWindow ) { + final List returnHaplotypes = new ArrayList(); // add the reference haplotype separately from all the others final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( fullReferenceWithPadding, refHaplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); @@ -343,7 +343,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { // for GGA mode, add the desired allele into the haplotype if it isn't already present if( !activeAllelesToGenotype.isEmpty() ) { - final HashMap eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place + final Map eventMap = GenotypingEngine.generateVCsFromAlignment( h, h.getAlignmentStartHapwrtRef(), h.getCigar(), fullReferenceWithPadding, h.getBases(), refLoc, "HCassembly" ); // BUGBUG: need to put this function in a shared place for( final VariantContext compVC : activeAllelesToGenotype ) { // for GGA mode, add the desired allele into the haplotype if it isn't already present final VariantContext vcOnHaplotype = eventMap.get(compVC.getStart()); @@ -378,7 +378,7 @@ public class SimpleDeBruijnAssembler extends LocalAssemblyEngine { return returnHaplotypes; } - private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final ArrayList haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { + private boolean addHaplotype( final Haplotype haplotype, final byte[] ref, final List haplotypeList, final int activeRegionStart, final int activeRegionStop, final boolean FORCE_INCLUSION_FOR_GGA_MODE ) { if( haplotype == null ) { return false; } final SWPairwiseAlignment swConsensus = new SWPairwiseAlignment( ref, haplotype.getBases(), SW_MATCH, SW_MISMATCH, SW_GAP, SW_GAP_EXTEND ); diff --git a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java index baab1f5fa..6e8a412c3 100644 --- a/public/java/src/org/broadinstitute/sting/utils/Haplotype.java +++ b/public/java/src/org/broadinstitute/sting/utils/Haplotype.java @@ -40,7 +40,7 @@ import java.util.*; public class Haplotype extends Allele { private GenomeLoc genomeLocation = null; - private HashMap eventMap = null; + private Map eventMap = null; private Cigar cigar; private int alignmentStartHapwrtRef; public int leftBreakPoint = 0; @@ -81,11 +81,11 @@ public class Haplotype extends Allele { return Arrays.hashCode(getBases()); } - public HashMap getEventMap() { + public Map getEventMap() { return eventMap; } - public void setEventMap( final HashMap eventMap ) { + public void setEventMap( final Map eventMap ) { this.eventMap = eventMap; } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 13add5e7d..dd6735d89 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -67,7 +67,7 @@ public class ActiveRegion implements HasGenomeLocation { * The reads included in this active region. May be empty upon creation, and expand / contract * as reads are added or removed from this region. */ - private final ArrayList reads = new ArrayList(); + private final List reads = new ArrayList(); /** * An ordered list (by genomic coordinate) of the ActivityProfileStates that went @@ -355,7 +355,7 @@ public class ActiveRegion implements HasGenomeLocation { * read coordinates. */ public void hardClipToActiveRegion() { - final ArrayList clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); + final List clippedReads = ReadClipper.hardClipToRegion( reads, extendedLoc.getStart(), extendedLoc.getStop() ); ReadUtils.sortReadsByCoordinate(clippedReads); clearReads(); addAll(clippedReads); diff --git a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java index 87526545d..45dd55af7 100644 --- a/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java +++ b/public/java/src/org/broadinstitute/sting/utils/clipping/ReadClipper.java @@ -362,8 +362,8 @@ public class ReadClipper { return GATKSAMRecord.emptyRead(read); } - public static ArrayList hardClipToRegion( final ArrayList reads, final int refStart, final int refStop ) { - final ArrayList returnList = new ArrayList( reads.size() ); + public static List hardClipToRegion( final List reads, final int refStart, final int refStop ) { + final List returnList = new ArrayList( reads.size() ); for( final GATKSAMRecord read : reads ) { final GATKSAMRecord clippedRead = hardClipToRegion( read, refStart, refStop ); if( !clippedRead.isEmpty() ) {