From c1ba12d71a7fa0879dcd4e5a8fcadbd6ece025de Mon Sep 17 00:00:00 2001 From: Tad Jordan Date: Thu, 3 Jan 2013 16:25:57 -0500 Subject: [PATCH 01/30] Added unit test for outputting sorted GATKReport Tables - Made few small modifications to code - Replaced the two arguments in GATKReportTable constructor with an enum used to specify way of sorting the table --- .../sting/gatk/report/GATKReport.java | 9 +- .../sting/gatk/report/GATKReportTable.java | 187 +++++++++--------- .../bqsr/RecalibrationArgumentCollection.java | 2 +- .../diagnostics/ErrorRatePerCycle.java | 2 +- .../varianteval/VariantEvalReportWriter.java | 2 +- .../utils/recalibration/QuantizationInfo.java | 2 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../sting/gatk/report/GATKReportUnitTest.java | 88 ++++++++- 8 files changed, 185 insertions(+), 109 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index e69924930..1451b8cde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -117,7 +117,7 @@ public class GATKReport { * @param numColumns the number of columns in this table */ public void addTable(final String tableName, final String tableDescription, final int numColumns) { - addTable(tableName, tableDescription, numColumns, false, false); + addTable(tableName, tableDescription, numColumns, GATKReportTable.TableSortingWay.DO_NOT_SORT); } /** @@ -126,11 +126,10 @@ public class GATKReport { * @param tableName the name of the table * @param tableDescription the description of the table * @param numColumns the number of columns in this table - * @param sortByRowID whether to sort the rows by the row ID - * @param sortByAllColumns whether to sort the rows by all columns starting from leftmost column + * @param sortingWay way to sort table */ - public void addTable(final String 
tableName, final String tableDescription, final int numColumns, final boolean sortByRowID, final boolean sortByAllColumns) { - GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortByRowID, sortByAllColumns); + public void addTable(final String tableName, final String tableDescription, final int numColumns, final GATKReportTable.TableSortingWay sortingWay) { + GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortingWay); tables.put(tableName, table); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 2bf7c9609..226e50f81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -46,8 +46,7 @@ public class GATKReportTable { private final String tableName; private final String tableDescription; - private final boolean sortByRowID; - private final boolean sortByAllColumns; + private final TableSortingWay sortingWay; private List underlyingData; private final List columnInfo; @@ -73,6 +72,12 @@ public class GATKReportTable { public int index() { return index; } } + public enum TableSortingWay { + SORT_BY_ROW, + SORT_BY_COLUMN, + DO_NOT_SORT + } + protected enum TableNameHeaderFields { NAME(2), DESCRIPTION(3); @@ -107,10 +112,7 @@ public class GATKReportTable { tableDescription = (tableNameData.length <= TableNameHeaderFields.DESCRIPTION.index()) ? "" : tableNameData[TableNameHeaderFields.DESCRIPTION.index()]; // table may have no description! 
(and that's okay) // when reading from a file, we do not re-sort the rows - sortByRowID = false; - - // when reading from a file, we do not re-sort the rows - sortByAllColumns = false; + sortingWay = TableSortingWay.DO_NOT_SORT; // initialize the data final int nColumns = Integer.parseInt(tableData[TableDataHeaderFields.COLS.index()]); @@ -181,7 +183,7 @@ public class GATKReportTable { * @param numColumns the number of columns in this table */ public GATKReportTable(final String tableName, final String tableDescription, final int numColumns) { - this(tableName, tableDescription, numColumns, true, false); + this(tableName, tableDescription, numColumns, TableSortingWay.SORT_BY_ROW); } /** @@ -190,10 +192,9 @@ public class GATKReportTable { * @param tableName the name of the table * @param tableDescription the description of the table * @param numColumns the number of columns in this table - * @param sortByRowID whether to sort rows by the row ID (instead of the order in which they were added) - * @param sortByAllColumns whether to sort rows by all columns (instead of the order in which they were added) + * @param sortingWay in what way to sort rows (instead of the order in which they were added) */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final boolean sortByRowID, final boolean sortByAllColumns) { + public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final TableSortingWay sortingWay) { if ( !isValidName(tableName) ) { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. 
GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } @@ -204,8 +205,7 @@ public class GATKReportTable { this.tableName = tableName; this.tableDescription = tableDescription; - this.sortByRowID = sortByRowID; - this.sortByAllColumns = sortByAllColumns; + this.sortingWay = sortingWay; underlyingData = new ArrayList(INITITAL_ARRAY_SIZE); columnInfo = new ArrayList(numColumns); @@ -218,7 +218,7 @@ public class GATKReportTable { * @param tableToCopy */ public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) { - this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortByRowID, tableToCopy.sortByAllColumns); + this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortingWay); for ( final GATKReportColumn column : tableToCopy.getColumnInfo() ) addColumn(column.getColumnName(), column.getFormat()); if ( copyData ) @@ -569,56 +569,53 @@ public class GATKReportTable { out.println(); // write the table body - if ( sortByAllColumns ) { - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; - int result = EQUAL; + int result = EQUAL; - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { 
+ result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } if( result != EQUAL) { return result; } - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - if( result != EQUAL) { - return result; - } - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - if( result != EQUAL) { - return result; - } } + return result; } - return result; - } - }); - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } else if ( sortByRowID ) { - // make sure that there are exactly the correct number of ID mappings - if ( rowIdToIndex.size() != underlyingData.size() ) - throw new ReviewedStingException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); + }); + for ( final Object[] row : underlyingData ) + writeRow(out, row); + break; + case SORT_BY_ROW: + // make sure that there are exactly the correct number of ID mappings + if ( rowIdToIndex.size() != underlyingData.size() ) + throw new ReviewedStingException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - throw new ReviewedStingException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); - } - for ( final Map.Entry rowKey : sortedMap.entrySet() ) - writeRow(out, underlyingData.get(rowKey.getValue())); - } else { - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch 
(ClassCastException e) { + throw new ReviewedStingException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); + } + for ( final Map.Entry rowKey : sortedMap.entrySet() ) + writeRow(out, underlyingData.get(rowKey.getValue())); + break; + case DO_NOT_SORT: + for ( final Object[] row : underlyingData ) + writeRow(out, row); + } out.println(); } @@ -735,53 +732,47 @@ public class GATKReportTable { } private List getOrderedRows() { - if ( sortByAllColumns ) { - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - int result = EQUAL; - - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - if( result != EQUAL) { - return result; + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; + int result = EQUAL; + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { + result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } + if( result != EQUAL) { + return result; + } } - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - if( result != EQUAL) { - return result; - } - } else { // default uses String comparison - result 
= objectArr1[x].toString().compareTo(objectArr2[x].toString()); - if( result != EQUAL) { - return result; - } - } + return result; } - return result; + }); + return underlyingData; + case SORT_BY_ROW: + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch (ClassCastException e) { + return underlyingData; } - }); - return underlyingData; - } else if ( !sortByRowID ) { - return underlyingData; + + final List orderedData = new ArrayList(underlyingData.size()); + for ( final int rowKey : sortedMap.values() ) + orderedData.add(underlyingData.get(rowKey)); + + return orderedData; + default: + return underlyingData; } - - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - return underlyingData; - } - - final List orderedData = new ArrayList(underlyingData.size()); - for ( final int rowKey : sortedMap.values() ) - orderedData.add(underlyingData.get(rowKey)); - - return orderedData; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 2f0f976fa..622413b18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -207,7 +207,7 @@ public class RecalibrationArgumentCollection { public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable; if(SORT_BY_ALL_COLUMNS) { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, false, true); + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { argumentsTable = new GATKReportTable("Arguments", "Recalibration 
argument collection values used in this run", 2); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index b4e781e91..5972322f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -124,7 +124,7 @@ public class ErrorRatePerCycle extends LocusWalker { public void initialize() { report = new GATKReport(); - report.addTable(reportName, reportDescription, 6, true, false); + report.addTable(reportName, reportDescription, 6, GATKReportTable.TableSortingWay.SORT_BY_ROW); table = report.getTable(reportName); table.addColumn("readgroup"); table.addColumn("cycle"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 6af70811f..6dad128fe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -162,7 +162,7 @@ public class VariantEvalReportWriter { // create the table final String tableName = ve.getSimpleName(); final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); - report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), true, false); + report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 
2 : datamap.size()), GATKReportTable.TableSortingWay.SORT_BY_ROW); // grab the table, and add the columns we need to it final GATKReportTable table = report.getTable(tableName); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index e0c1261fe..fc942499c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -70,7 +70,7 @@ public class QuantizationInfo { public GATKReportTable generateReportTable(boolean sortBycols) { GATKReportTable quantizedTable; if(sortBycols) { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, false, true); + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index d4e781fdd..58327b924 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -287,7 +287,7 @@ public class RecalUtils { final GATKReportTable reportTable; if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { if(sortByCols) { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), false, true); + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { reportTable = new 
GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index d20b70b42..40d8d8ff9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -32,6 +32,13 @@ import org.testng.annotations.Test; import java.io.File; import java.io.IOException; import java.io.PrintStream; +import java.util.Random; +import java.io.FileInputStream; +import java.io.DataInputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; + public class GATKReportUnitTest extends BaseTest { @Test @@ -77,6 +84,85 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } + private GATKReportTable getTableWithRandomValues() { + Random number = new Random(123L); + final int VALUESRANGE = 10; + + GATKReport report = GATKReport.newSimpleReport("TableName", "col1", "col2", "col3"); + GATKReportTable table = new GATKReportTable("testSortingTable", "table with random values sorted by columns", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN ); + + final int NUMROWS = 100; + for (int x = 0; x < NUMROWS; x++) { + report.addRow(number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE)); + } + return table; + } + + @Test(enabled = true) + public void testSortingByColumn() { + Assert.assertEquals(isSorted(getTableWithRandomValues()), true); + } + + private boolean isSorted(GATKReportTable table) { + boolean result = true; + File testingSortingTableFile = new File("myFile.txt"); + + try { + // Connect print stream to the output stream + PrintStream ps = new PrintStream(testingSortingTableFile); + table.write(ps); + ps.close(); + 
} + catch (Exception e){ + System.err.println ("Error: " + e.getMessage()); + } + + ArrayList rows = new ArrayList(); + try { + // Open the file + FileInputStream fStream = new FileInputStream(testingSortingTableFile); + // Get the object of DataInputStream + DataInputStream in = new DataInputStream(fStream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + + String[] parts = strLine.split(" "); + int l = parts.length; + int[] row = new int[l]; + for(int n = 0; n < l; n++) { + row[n] = Integer.parseInt(parts[n]); + } + rows.add(row); + } + //Close the input stream + in.close(); + } catch (Exception e){//Catch exception if any + System.err.println("Error: " + e.getMessage()); + } + for (int x = 1; x < rows.size() && result; x++) { + result = checkRowOrder(rows.get(x - 1), rows.get(x)); + } + return result; + } + + private boolean checkRowOrder(int[] row1, int[] row2) { + int l = row1.length; + final int EQUAL = 0; + + int result = EQUAL; + + for(int x = 0; x < l && ( result <= EQUAL); x++) { + result = ((Integer)row1[x]).compareTo(row2[x]); + } + if (result <= EQUAL) { + return true; + } else { + return false; + } + } + private GATKReportTable makeBasicTable() { GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); GATKReportTable table = report.getTable("TableName"); @@ -168,7 +254,7 @@ public class GATKReportUnitTest extends BaseTest { table.set("RZ", "SomeFloat", 535646345.657453464576); table.set("RZ", "TrueFalse", true); - report1.addTable("Table3", "blah", 1, true, false); + report1.addTable("Table3", "blah", 1, GATKReportTable.TableSortingWay.SORT_BY_ROW); report1.getTable("Table3").addColumn("a"); report1.getTable("Table3").addRowIDMapping("q", 2); report1.getTable("Table3").addRowIDMapping("5", 3); From 47e620dfbcc6c7b38f68d578780acecbeb1f81d0 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 19 Dec 2012 14:55:34 
-0500 Subject: [PATCH 02/30] Create BAM index to test shard boundaries --- .../TraverseActiveRegionsUnitTest.java | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 4cda1455e..8f2f2be70 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -112,7 +112,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); - // TODO: test shard boundaries // TODO: reads with indels // TODO: reads which span many regions // TODO: reads which are partially between intervals (in/outside extension) @@ -142,6 +141,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); + reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); + reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); createBAM(reads); @@ -153,7 +155,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { File indexFile = new File(testBAI); indexFile.deleteOnExit(); - SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(reads.get(0).getHeader(), true, outFile); + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { 
out.addAlignment(read); } @@ -272,6 +274,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -286,6 +291,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -309,6 +320,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -323,6 +337,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = 
activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -347,6 +367,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -361,6 +384,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 
10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -429,6 +458,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(readName); From ffbd4d85f2e0112b32df0bbba00330b00a0806cf Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 14 Dec 2012 17:37:29 -0500 Subject: [PATCH 03/30] No need to pass fields as parameters --- .../sting/gatk/traversals/TraverseActiveRegions.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 06fc01232..33323ba67 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -227,7 +227,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : reads ) { + for( final GATKSAMRecord read : myReads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) @@ -278,7 +278,7 @@ public class 
TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); From 4cc372f53b98dca13a27b6f7130315c87c8074e4 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 21 Dec 2012 15:14:50 -0500 Subject: [PATCH 04/30] LocusShardDataProvider doesn't need its own GenomeLocParser --- .../gatk/datasources/providers/LocusShardDataProvider.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 55304da34..4888b9f41 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -22,11 +22,6 @@ public class LocusShardDataProvider extends ShardDataProvider { */ private final ReadProperties sourceInfo; - /** - * The parser, used to create and build new GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - /** * The particular locus for which data is provided. Should be contained within shard.getGenomeLocs(). 
*/ @@ -45,7 +40,6 @@ public class LocusShardDataProvider extends ShardDataProvider { public LocusShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { super(shard,genomeLocParser,reference,rods); this.sourceInfo = sourceInfo; - this.genomeLocParser = genomeLocParser; this.locus = locus; this.locusIterator = locusIterator; } From 14a3ac0e3cc227b7df7eb7dba860bfda5d8073e8 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 2 Jan 2013 17:01:21 -0500 Subject: [PATCH 05/30] Enable the use of alternate shards --- .../arguments/GATKArgumentCollection.java | 10 + .../gatk/executive/LinearMicroScheduler.java | 10 + .../sting/gatk/executive/MicroScheduler.java | 8 +- ...ctiveRegionShardTraverseActiveRegions.java | 290 ++++++++++++++++++ ...imentalReadShardTraverseActiveRegions.java | 290 ++++++++++++++++++ .../utils/activeregion/ActiveRegion.java | 1 + .../ExperimentalActiveRegionShardType.java | 14 + .../TraverseActiveRegionsUnitTest.java | 5 + 8 files changed, 627 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d9c7c9008..beaeacc85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; 
import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; @@ -448,5 +449,14 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // Experimental Active Region Traversal modes + // + // -------------------------------------------------------------------------------------------------------------- + + @Argument(fullName = "active_region_traversal_shard_type", shortName = "active_region_traversal_shard_type", doc = "Choose an experimental shard type for active region traversal, instead of the default LocusShard", required = false) + public ExperimentalActiveRegionShardType activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index f3c1ae91c..84d975879 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -11,6 +11,8 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import 
org.broadinstitute.sting.gatk.traversals.ExperimentalActiveRegionShardTraverseActiveRegions; +import org.broadinstitute.sting.gatk.traversals.ExperimentalReadShardTraverseActiveRegions; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -93,6 +95,14 @@ public class LinearMicroScheduler extends MicroScheduler { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator } + else if( traversalEngine instanceof ExperimentalReadShardTraverseActiveRegions ) { + final Object result = ((ExperimentalReadShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } + else if( traversalEngine instanceof ExperimentalActiveRegionShardTraverseActiveRegions) { + final Object result = ((ExperimentalActiveRegionShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } Object result = accumulator.finishTraversal(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index f8aec1489..13c11def6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.MathUtils; +import 
org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; @@ -245,7 +246,12 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(); + switch (engine.getArguments().activeRegionShardType) { + case LOCUSSHARD: return new TraverseActiveRegions(); + case READSHARD: return new ExperimentalReadShardTraverseActiveRegions(); + case ACTIVEREGIONSHARD: return new ExperimentalActiveRegionShardTraverseActiveRegions(); + default: throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type of ActiveRegionWalker."); + } } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java new file mode 100644 index 000000000..71cb89ad9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java @@ -0,0 +1,290 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; 
+import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final LinkedList workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + + int minStart = Integer.MAX_VALUE; + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference 
location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've moved across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup).
+ final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + + return sum; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! 
engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 
1.0 : 0.0); + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion(activeRegion, sum, walker); + } else { + break; + } + } + + return sum; + } + + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { + final ArrayList placedReads = new ArrayList(); + for( final GATKSAMRecord read : myReads ) { + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + // The region with the highest amount of overlap is chosen as the primary region for the read (tie breaking favors the rightmost region) + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + bestRegion = otherRegionToTest; + } + } + bestRegion.add( read ); + + // The read is also added to all other regions in which it overlaps but marked as non-primary + if( walker.wantsNonPrimaryReads() ) { + if( !bestRegion.equals(activeRegion) ) { + activeRegion.add( read ); + } + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( !bestRegion.equals(otherRegionToTest) ) { + // check for non-primary vs. 
extended + if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } + } + } + } + placedReads.add( read ); + // check for non-primary vs. extended + } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + activeRegion.add( read ); + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + } + myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region + // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. 
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java new file mode 100644 index 000000000..516a7dc34 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -0,0 +1,290 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final 
LinkedList workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + + int minStart = Integer.MAX_VALUE; + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + + // If this is the last pileup for this shard calculate the minimum alignment 
start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've moved across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! 
profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + + return sum; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! 
engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 
1.0 : 0.0); + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion(activeRegion, sum, walker); + } else { + break; + } + } + + return sum; + } + + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { + final ArrayList placedReads = new ArrayList(); + for( final GATKSAMRecord read : myReads ) { + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + // The region with the highest amount of overlap is chosen as the primary region for the read (tie breaking favors the rightmost region) + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + bestRegion = otherRegionToTest; + } + } + bestRegion.add( read ); + + // The read is also added to all other regions in which it overlaps but marked as non-primary + if( walker.wantsNonPrimaryReads() ) { + if( !bestRegion.equals(activeRegion) ) { + activeRegion.add( read ); + } + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( !bestRegion.equals(otherRegionToTest) ) { + // check for non-primary vs. 
extended + if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } + } + } + } + placedReads.add( read ); + // check for non-primary vs. extended + } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + activeRegion.add( read ); + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + } + myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region + // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. 
+ * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index c12dfcee9..3e3bb220a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -30,6 +30,7 @@ public class ActiveRegion implements HasGenomeLocation { this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; + this.extension = extension; extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java new file mode 100644 index 000000000..1e9a0ee94 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 1/2/13 + * Time: 4:59 PM + * To change this template use File | Settings | File Templates. 
+ */ +public enum ExperimentalActiveRegionShardType { + LOCUSSHARD, // default/legacy type + READSHARD, + ACTIVEREGIONSHARD +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 8f2f2be70..645ba3f3f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -3,10 +3,12 @@ package org.broadinstitute.sting.gatk.traversals; import com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -478,6 +480,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List createDataProviders(List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); + GATKArgumentCollection arguments = new GATKArgumentCollection(); + arguments.activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; // make explicit + engine.setArguments(arguments); t.initialize(engine); Collection samFiles = new ArrayList(); From e7553545efe0101b4101383e9141afdbfe0fdf6e Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 3 Jan 2013 13:57:36 
-0500 Subject: [PATCH 06/30] Initial updates for ReadShard --- .../sting/gatk/GenomeAnalysisEngine.java | 28 ++- .../providers/LocusShardDataProvider.java | 16 ++ ...imentalReadShardTraverseActiveRegions.java | 235 ++++++++++-------- .../utils/activeregion/ActiveRegion.java | 19 +- .../TraverseActiveRegionsUnitTest.java | 73 ++++-- 5 files changed, 242 insertions(+), 129 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 1187039bb..f17450247 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -570,11 +570,29 @@ public class GenomeAnalysisEngine { else if(walker instanceof ActiveRegionWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); - } + + switch(argCollection.activeRegionShardType) { + case LOCUSSHARD: + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + case READSHARD: + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : + new ReadShardBalancer(); + + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(), readShardBalancer); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), readShardBalancer); + case ACTIVEREGIONSHARD: + throw new UserException.CommandLineException("Not implemented."); + default: + throw new UserException.CommandLineException("Invalid active region shard type."); + } + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. 
if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 4888b9f41..1607469eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -44,6 +44,22 @@ public class LocusShardDataProvider extends ShardDataProvider { this.locusIterator = locusIterator; } + /** + * Create a data provider based on an input provider + * Used only by ExperimentalReadShardTraverseActiveRegions + * @param dataProvider + * @param sourceInfo + * @param genomeLocParser + * @param locus + * @param locusIterator + */ + public LocusShardDataProvider(ShardDataProvider dataProvider, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator) { + super(dataProvider.getShard(),genomeLocParser,dataProvider.getReference(),dataProvider.getReferenceOrderedData()); + this.sourceInfo = sourceInfo; + this.locus = locus; + this.locusIterator = locusIterator; + } + /** * Returns information about the source of the reads. * @return Info about the source of the reads. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java index 516a7dc34..672d37f7f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -1,32 +1,35 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; -public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,ReadShardDataProvider> { /** * our log, which we want to capture 
anything from this class */ protected final static Logger logger = Logger.getLogger(TraversalEngine.class); private final LinkedList workQueue = new LinkedList(); - private final LinkedHashSet myReads = new LinkedHashSet(); + private final LinkedList myReads = new LinkedList(); @Override public String getTraversalUnits() { @@ -35,71 +38,65 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn @Override public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, + final ReadShardDataProvider readDataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseActiveRegion.traverse: Read Shard is %s", readDataProvider)); - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - int minStart = Integer.MAX_VALUE; + final ReadView readView = new ReadView(readDataProvider); + final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + Shard readShard = readDataProvider.getShard(); + SAMFileHeader header = readShard.getReadProperties().getHeader(); + WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), + readView.iterator(), readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - // We keep processing while the next reference location is within the interval - GenomeLoc 
prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + LocusShardDataProvider locusDataProvider = new LocusShardDataProvider(readDataProvider, + iterator.getSourceInfo(), engine.getGenomeLocParser(), iterator.getLocus(), iterator); - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); + final LocusView locusView = new AllLocusView(locusDataProvider); + final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); + 
readDataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
- final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); + locusDataProvider.close(); } - updateCumulativeMetrics(dataProvider.getShard()); + windowMaker.close(); + + updateCumulativeMetrics(readDataProvider.getShard()); if ( ! profile.isEmpty() ) incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); @@ -113,7 +110,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); } } workQueue.addAll( activeRegions ); @@ -121,21 +118,13 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // now process the active regions, where possible + boolean emptyQueue = false; + sum = processActiveRegions(walker, sum, emptyQueue); return sum; } - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - private boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - /** * Take the individual isActive calls and integrate them into contiguous active regions and * add these blocks of work to the work queue @@ -191,12 +180,12 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn // // -------------------------------------------------------------------------------- - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); return sum; } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); } } @@ -214,70 +203,99 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn } } - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { + final int lastRegionStart = workQueue.getLast().getLocation().getStart(); + final String lastRegionContig = workQueue.getLast().getLocation().getContig(); + + // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion(activeRegion, sum, walker); - } else { - break; + while( workQueue.peekFirst() != null ) { + ActiveRegion firstRegion = workQueue.getFirst(); + final String firstRegionContig = firstRegion.getLocation().getContig(); + if (emptyQueue || firstRegionContig != lastRegionContig) { + sum = processFirstActiveRegion(sum, walker); + } + else { + final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? 
firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); + if (lastRegionStart > firstRegionMaxReadStop) { + sum = processFirstActiveRegion( sum, walker ); + } + else { + break; + } } } return sum; } - private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { + /** + * Process the first active region and all remaining reads which overlap + * + * Remove the first active region from the queue + * (NB: some reads associated with this active region may have already been processed) + * + * Remove all of these reads from the queue + * (NB: some may be associated with other active regions) + * + * @param sum + * @param walker + * @return + */ + private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { + final ActiveRegion firstRegion = workQueue.removeFirst(); + + GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here + GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + + while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || + (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { + if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; + long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); + ActiveRegion bestRegion = firstRegion; for( final ActiveRegion otherRegionToTest : workQueue ) { - if( 
otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); bestRegion = otherRegionToTest; } } - bestRegion.add( read ); + bestRegion.add( firstRead ); // The read is also added to all other regions in which it overlaps but marked as non-primary if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); + if( !bestRegion.equals(firstRegion) ) { + firstRegion.add(firstRead); } for( final ActiveRegion otherRegionToTest : workQueue ) { if( !bestRegion.equals(otherRegionToTest) ) { // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); + if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); } } } } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. 
- logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); + // check for non-primary vs. extended + } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + firstRegion.add( firstRead ); + } + } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { + firstRegion.add( firstRead ); + } + + myReads.removeFirst(); + firstRead = myReads.peekFirst(); + firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + } + + logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? "active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); + final M x = walker.map( firstRegion, null ); + return walker.reduce(x, sum); } /** @@ -285,6 +303,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn * Ugly for now but will be cleaned up when we push this functionality more into the engine */ public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + boolean emptyQueue = true; + return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 3e3bb220a..d1199ad3d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -26,14 +26,19 @@ public class ActiveRegion implements HasGenomeLocation { private final 
GenomeLocParser genomeLocParser; public final boolean isActive; + // maximum stop position of all reads with start position in this active region + // Used only by ExperimentalReadShardTraverseActiveRegions + // NB: these reads may not be associated with this active region! + private int maxReadStop; + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; - this.extension = extension; extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; + maxReadStop = activeRegionLoc.getStart(); } @Override @@ -94,6 +99,18 @@ public class ActiveRegion implements HasGenomeLocation { public void remove( final GATKSAMRecord read ) { reads.remove( read ); } public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } + public void setMaxReadStop(int maxReadStop) { + this.maxReadStop = maxReadStop; + } + + public int getMaxReadStop() { + return maxReadStop; + } + + public int getExtendedMaxReadStop() { + return maxReadStop + extension; + } + public boolean equalExceptReads(final ActiveRegion other) { if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; if ( isActive != other.isActive ) return false; diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 645ba3f3f..1204e639e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -4,6 +4,9 @@ import com.google.java.contract.PreconditionError; import 
net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -17,7 +20,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -97,7 +99,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - private final TraverseActiveRegions t = new TraverseActiveRegions(); + private final TraverseActiveRegions traverse = new TraverseActiveRegions(); + private final ExperimentalReadShardTraverseActiveRegions readShardTraverse = new ExperimentalReadShardTraverseActiveRegions(); + private final ExperimentalActiveRegionShardTraverseActiveRegions activeRegionShardTraverse = new ExperimentalActiveRegionShardTraverseActiveRegions(); private IndexedFastaSequenceFile reference; private SAMSequenceDictionary dictionary; @@ -108,6 +112,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; + private static final 
ExperimentalActiveRegionShardType shardType = ExperimentalActiveRegionShardType.LOCUSSHARD; + @BeforeClass private void init() throws FileNotFoundException { reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); @@ -175,8 +181,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); + for (ShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) { + traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } @@ -413,10 +419,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) - t.traverse(walker, dataProvider, 0); + for (ShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) + traverse(walker, dataProvider, 0); - t.endTraversal(walker, 0); + endTraversal(walker, 0); return walker.mappedActiveRegions; } @@ -477,13 +483,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(List intervals, String bamFile) { + private List createDataProviders(List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); GATKArgumentCollection arguments = new GATKArgumentCollection(); - arguments.activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; // make explicit + arguments.activeRegionShardType = shardType; // make explicit engine.setArguments(arguments); - t.initialize(engine); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -491,13 +496,51 @@ public 
class TraverseActiveRegionsUnitTest extends BaseTest { SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } + List providers = new ArrayList(); + + switch (shardType) { + case LOCUSSHARD: + traverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + } + break; + case READSHARD: + readShardTraverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ReadShardBalancer())) { + providers.add(new ReadShardDataProvider(shard, genomeLocParser, shard.iterator(), reference, new ArrayList())); + } + break; + default: // other types not implemented } return providers; } + + private void traverse(DummyActiveRegionWalker walker, ShardDataProvider dataProvider, int i) { + switch (shardType) { + case LOCUSSHARD: + traverse.traverse(walker, (LocusShardDataProvider) dataProvider, i); + break; + case READSHARD: + readShardTraverse.traverse(walker, (ReadShardDataProvider) dataProvider, i); + break; + default: // other types not implemented + } + } + + private void 
endTraversal(DummyActiveRegionWalker walker, int i) { + switch (shardType) { + case LOCUSSHARD: + traverse.endTraversal(walker, i); + break; + case READSHARD: + readShardTraverse.endTraversal(walker, i); + break; + default: // other types not implemented + } + } + } From 319d651e4a7b5198cd266ca337ebc9750e53fe9a Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 3 Jan 2013 15:49:40 -0500 Subject: [PATCH 07/30] Initial updates for ActiveRegionShard --- .../sting/gatk/GenomeAnalysisEngine.java | 5 +- .../ActiveRegionShardDataProvider.java | 58 +++++ .../datasources/reads/ActiveRegionShard.java | 41 +++ .../reads/ActiveRegionShardBalancer.java | 32 +++ .../sting/gatk/datasources/reads/Shard.java | 4 +- .../gatk/executive/LinearMicroScheduler.java | 13 + ...ctiveRegionShardTraverseActiveRegions.java | 233 ++++++++++-------- ...imentalReadShardTraverseActiveRegions.java | 2 +- .../traversals/TraverseActiveRegions.java | 2 +- .../TraverseActiveRegionsUnitTest.java | 22 +- 10 files changed, 298 insertions(+), 114 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f17450247..bee25dc2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -588,7 +588,10 @@ public class GenomeAnalysisEngine { else return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), readShardBalancer); case 
ACTIVEREGIONSHARD: - throw new UserException.CommandLineException("Not implemented."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new ActiveRegionShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); default: throw new UserException.CommandLineException("Invalid active region shard type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java new file mode 100644 index 000000000..55e51f934 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.providers; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.iterators.LocusIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Collection; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShardDataProvider extends ShardDataProvider { + final private ReadShardDataProvider readProvider; + final private LocusShardDataProvider locusProvider; + + public ActiveRegionShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, StingSAMIterator reads, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { + super(shard, genomeLocParser, reference, rods); // TODO: necessary? 
+ readProvider = new ReadShardDataProvider(shard, genomeLocParser, reads, reference, rods); + locusProvider = new LocusShardDataProvider(shard, sourceInfo, genomeLocParser, locus, locusIterator, reference, rods); + } + + public ReadShardDataProvider getReadShardDataProvider() { + return readProvider; + } + + public LocusShardDataProvider getLocusShardDataProvider(LocusIterator iterator) { + return locusProvider; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java new file mode 100755 index 000000000..381b193e9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileSpan; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.List; +import java.util.Map; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShard extends ReadShard { + public ActiveRegionShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { + super(parser, readsDataSource, fileSpans, loci, isUnmapped); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java new file mode 100644 index 000000000..338dd1bdf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShardBalancer extends ReadShardBalancer { + // TODO ? +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index e22a7a54d..314156af6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -40,7 +40,9 @@ import java.util.Map; */ public abstract class Shard implements HasGenomeLocation { public enum ShardType { - READ, LOCUS + READ, + LOCUS, + ACTIVEREGION // Used only by ExperimentalActiveRegionShardTraverseActiveRegions } protected final GenomeLocParser parser; // incredibly annoying! 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 84d975879..44f9978a6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -80,6 +81,18 @@ public class LinearMicroScheduler extends MicroScheduler { } windowMaker.close(); } + else if(shard.getShardType() == Shard.ShardType.ACTIVEREGION) { + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + ShardDataProvider dataProvider = new ActiveRegionShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),getReadIterator(shard),iterator.getLocus(),iterator,reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + if ( walker.isDone() ) break; + } + windowMaker.close(); + } else { ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git 
a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java index 71cb89ad9..45d132678 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java @@ -1,32 +1,35 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; -public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,ActiveRegionShardDataProvider> { 
/** * our log, which we want to capture anything from this class */ protected final static Logger logger = Logger.getLogger(TraversalEngine.class); private final LinkedList workQueue = new LinkedList(); - private final LinkedHashSet myReads = new LinkedHashSet(); + private final LinkedList myReads = new LinkedList(); @Override public String getTraversalUnits() { @@ -35,71 +38,65 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra @Override public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, + final ActiveRegionShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("ExperimentalActiveRegionShardTraverseActiveRegions.traverse: Shard is %s", dataProvider)); - final LocusView locusView = new AllLocusView(dataProvider); + ReadShardDataProvider readDataProvider = dataProvider.getReadShardDataProvider(); - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - int minStart = Integer.MAX_VALUE; + final ReadView readView = new ReadView(readDataProvider); + final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + Shard readShard = readDataProvider.getShard(); + SAMFileHeader header = readShard.getReadProperties().getHeader(); + WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), + readView.iterator(), 
readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + LocusShardDataProvider locusDataProvider = dataProvider.getLocusShardDataProvider(iterator); + final LocusView locusView = new AllLocusView(locusDataProvider); + final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart 
= Math.min(minStart, read.getAlignmentStart()); + readDataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
- final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); + locusDataProvider.close(); } - updateCumulativeMetrics(dataProvider.getShard()); + windowMaker.close(); + + updateCumulativeMetrics(readDataProvider.getShard()); if ( ! profile.isEmpty() ) incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); @@ -113,7 +110,7 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); } } workQueue.addAll( activeRegions ); @@ -121,21 +118,13 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // now process the active regions, where possible + boolean emptyQueue = false; + sum = processActiveRegions(walker, sum, emptyQueue); return sum; } - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - private boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - /** * Take the individual isActive calls and integrate them into contiguous active regions and * add these blocks of work to the work queue @@ -191,12 +180,12 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra // // -------------------------------------------------------------------------------- - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); return sum; } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); } } @@ -214,70 +203,99 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra } } - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) 
in the workQueue we can unload those regions and process them + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { + final int lastRegionStart = workQueue.getLast().getLocation().getStart(); + final String lastRegionContig = workQueue.getLast().getLocation().getContig(); + + // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion(activeRegion, sum, walker); - } else { - break; + while( workQueue.peekFirst() != null ) { + ActiveRegion firstRegion = workQueue.getFirst(); + final String firstRegionContig = firstRegion.getLocation().getContig(); + if (emptyQueue || firstRegionContig != lastRegionContig) { + sum = processFirstActiveRegion(sum, walker); + } + else { + final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? 
firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); + if (lastRegionStart > firstRegionMaxReadStop) { + sum = processFirstActiveRegion( sum, walker ); + } + else { + break; + } } } return sum; } - private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { + /** + * Process the first active region and all remaining reads which overlap + * + * Remove the first active region from the queue + * (NB: some reads associated with this active region may have already been processed) + * + * Remove all of these reads from the queue + * (NB: some may be associated with other active regions) + * + * @param sum + * @param walker + * @return + */ + private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { + final ActiveRegion firstRegion = workQueue.removeFirst(); + + GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here + GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + + while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || + (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { + if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; + long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); + ActiveRegion bestRegion = firstRegion; for( final ActiveRegion otherRegionToTest : workQueue ) { - if( 
otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); bestRegion = otherRegionToTest; } } - bestRegion.add( read ); + bestRegion.add( firstRead ); // The read is also added to all other regions in which it overlaps but marked as non-primary if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); + if( !bestRegion.equals(firstRegion) ) { + firstRegion.add(firstRead); } for( final ActiveRegion otherRegionToTest : workQueue ) { if( !bestRegion.equals(otherRegionToTest) ) { // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); + if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); } } } } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. 
- logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); + // check for non-primary vs. extended + } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + firstRegion.add( firstRead ); + } + } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { + firstRegion.add( firstRead ); + } + + myReads.removeFirst(); + firstRead = myReads.peekFirst(); + firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + } + + logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? "active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); + final M x = walker.map( firstRegion, null ); + return walker.reduce(x, sum); } /** @@ -285,6 +303,7 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra * Ugly for now but will be cleaned up when we push this functionality more into the engine */ public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + boolean emptyQueue = true; + return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java index 672d37f7f..299ee4f56 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -40,7 +40,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalE public T traverse( final ActiveRegionWalker walker, final ReadShardDataProvider readDataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Read Shard is %s", readDataProvider)); + logger.debug(String.format("ExperimentalReadShardTraverseActiveRegions.traverse: Read Shard is %s", readDataProvider)); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 33323ba67..2562fcccd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -43,7 +43,7 @@ public class TraverseActiveRegions extends TraversalEngine walker, final LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 1204e639e..5051bc35f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -4,6 +4,7 @@ import 
com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -32,6 +33,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; +import org.testng.TestException; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -513,7 +515,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { providers.add(new ReadShardDataProvider(shard, genomeLocParser, shard.iterator(), reference, new ArrayList())); } break; - default: // other types not implemented + case ACTIVEREGIONSHARD: + activeRegionShardTraverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new ActiveRegionShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, shard.iterator(), window.getLocus(), window, reference, new ArrayList())); + } + } + break; + default: throw new TestException("Invalid shard type"); } return providers; @@ -527,7 +537,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { case READSHARD: readShardTraverse.traverse(walker, (ReadShardDataProvider) dataProvider, i); break; - default: // other types not implemented + case 
ACTIVEREGIONSHARD: + activeRegionShardTraverse.traverse(walker, (ActiveRegionShardDataProvider) dataProvider, i); + break; + default: throw new TestException("Invalid shard type"); } } @@ -539,7 +552,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { case READSHARD: readShardTraverse.endTraversal(walker, i); break; - default: // other types not implemented + case ACTIVEREGIONSHARD: + activeRegionShardTraverse.endTraversal(walker, i); + break; + default: throw new TestException("Invalid shard type"); } } From fbee4c11f1bca8e530e892ae02b29dfd7e978367 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 15:20:25 -0500 Subject: [PATCH 09/30] Unit tests for ProgressMeterData --- .../ProgressMeterDataUnitTest.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java new file mode 100644 index 000000000..d6ea3b227 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * UnitTests for the ProgressMeterData + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class ProgressMeterDataUnitTest extends BaseTest { + @Test + public void testBasic() { + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getElapsedSeconds(), 1.0); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getUnitsProcessed(), 2); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getBpProcessed(), 3); + } + + @Test + public void testFraction() { + final double TOL = 1e-3; + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(10), 0.1, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(10), 0.2, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(100), 0.01, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(100), 0.02, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(0), 1.0, TOL); + } + + @Test + public void testSecondsPerBP() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, 1, M).secondsPerMillionBP(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, M/10).secondsPerMillionBP(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.0, 1, M).secondsPerMillionBP(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 0).secondsPerMillionBP(), 1e6, TOL); + } + + @Test + public void testSecondsPerElement() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, M, 1).secondsPerMillionElements(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, M/10, 1).secondsPerMillionElements(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.00, M, 1).secondsPerMillionElements(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 0, 1).secondsPerMillionElements(), 1e6, TOL); + } +} From 1ba8d47a81c94561feabef7740e2638b670a88ae Mon Sep 17 00:00:00 2001 
From: Mark DePristo Date: Wed, 2 Jan 2013 16:12:08 -0500 Subject: [PATCH 10/30] Unit tests for ProgressMeterDaemon --- .../utils/progressmeter/ProgressMeter.java | 13 ++- .../progressmeter/ProgressMeterDaemon.java | 31 +++++- .../ProgressMeterDaemonUnitTest.java | 102 ++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index 161335957..c9d849227 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -160,6 +160,13 @@ public class ProgressMeter { public ProgressMeter(final File performanceLogFile, final String processingUnitName, final GenomeLocSortedSet processingIntervals) { + this(performanceLogFile, processingUnitName, processingIntervals, ProgressMeterDaemon.DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + + protected ProgressMeter(final File performanceLogFile, + final String processingUnitName, + final GenomeLocSortedSet processingIntervals, + final long pollingFrequency) { if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); @@ -184,10 +191,14 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer - progressMeterDaemon = new ProgressMeterDaemon(this); + progressMeterDaemon = new ProgressMeterDaemon(this, pollingFrequency); start(); } + public ProgressMeterDaemon getProgressMeterDaemon() { + return progressMeterDaemon; + } + /** * Start up the progress meter, printing initialization message and starting up the * daemon thread for 
periodic printing. diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java index 16887400a..7edd8e724 100644 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java @@ -8,10 +8,20 @@ package org.broadinstitute.sting.utils.progressmeter; * Time: 9:16 PM */ public final class ProgressMeterDaemon extends Thread { + public final static long DEFAULT_POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + /** * How frequently should we poll and print progress? */ - private final static long POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + private final long pollFrequencyMilliseconds; + + /** + * How long are we waiting between print progress calls are issued? + * @return the time in milliseconds between progress meter calls + */ + private long getPollFrequencyMilliseconds() { + return pollFrequencyMilliseconds; + } /** * Are we to continue periodically printing status, or should we shut down? 
@@ -27,13 +37,20 @@ public final class ProgressMeterDaemon extends Thread { * Create a new ProgressMeterDaemon printing progress for meter * @param meter the progress meter to print progress of */ - public ProgressMeterDaemon(final ProgressMeter meter) { + public ProgressMeterDaemon(final ProgressMeter meter, final long pollFrequencyMilliseconds) { if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); + if ( pollFrequencyMilliseconds <= 0 ) throw new IllegalArgumentException("pollFrequencyMilliseconds must be greater than 0 but got " + pollFrequencyMilliseconds); + this.meter = meter; + this.pollFrequencyMilliseconds = pollFrequencyMilliseconds; setDaemon(true); setName("ProgressMeterDaemon"); } + public ProgressMeterDaemon(final ProgressMeter meter) { + this(meter, DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + /** * Tells this daemon thread to shutdown at the next opportunity, as the progress * metering is complete. @@ -42,6 +59,14 @@ public final class ProgressMeterDaemon extends Thread { this.done = true; } + /** + * Is this daemon thread done? + * @return true if done, false otherwise + */ + public boolean isDone() { + return done; + } + /** * Start up the ProgressMeterDaemon, polling every tens of seconds to print, if * necessary, the provided progress meter. Never exits until the JVM is complete, @@ -51,7 +76,7 @@ public final class ProgressMeterDaemon extends Thread { while (! 
done) { meter.printProgress(false); try { - Thread.sleep(POLL_FREQUENCY_MILLISECONDS); + Thread.sleep(getPollFrequencyMilliseconds()); } catch (InterruptedException e) { throw new RuntimeException(e); } diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java new file mode 100644 index 000000000..420db683e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * UnitTests for the ProgressMeterDaemon + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProgressMeterDaemonUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + // capture and count calls to progress + private class TestingProgressMeter extends ProgressMeter { + final List progressCalls = new LinkedList(); + + private TestingProgressMeter(final long poll) { + super(null, "test", new GenomeLocSortedSet(genomeLocParser), poll); + } + + @Override + protected synchronized void printProgress(boolean mustPrint) { + progressCalls.add(System.currentTimeMillis()); + } + } + + @DataProvider(name = "PollingData") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int ticks : Arrays.asList(1, 5, 10) ) { + for ( final int poll : Arrays.asList(10, 100) ) { + tests.add(new Object[]{poll, ticks}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90) + public void testMe(final long poll, final int ticks) throws InterruptedException { + final 
TestingProgressMeter meter = new TestingProgressMeter(poll); + final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); + Assert.assertTrue(daemon.isDaemon()); + + Assert.assertFalse(daemon.isDone()); + Thread.sleep(ticks * poll); + Assert.assertFalse(daemon.isDone()); + + daemon.done(); + Assert.assertTrue(daemon.isDone()); + + Assert.assertEquals(meter.progressCalls.size(), ticks, + "Expected " + ticks + " progress calls from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + } +} From 7df47418d839182cabd90839cea94da400d1d8eb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 14:43:36 -0500 Subject: [PATCH 11/30] BQSR optimization: make RecalibrationTables thread-local, and merge results in onTraversalDone -- With the newer, faster BQSR, scaling was limited by the NestedIntegerArray. The solution to this is to make the entire table thread-local, so that each nct thread has its own data and doesn't have any collisions. 
-- Removed the previous partial solution of having a thread-local quality score table -- Added a new argument -lowMemory --- .../bqsr/AdvancedRecalibrationEngine.java | 42 +------- .../gatk/walkers/bqsr/BaseRecalibrator.java | 31 +++--- .../walkers/bqsr/RecalibrationEngine.java | 11 ++- .../bqsr/StandardRecalibrationEngine.java | 99 ++++++++++++++++--- .../recalibration/RecalibrationTables.java | 16 +-- 5 files changed, 130 insertions(+), 69 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index 255f1fd05..3871101eb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -25,35 +25,21 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. 
*/ -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.recalibration.RecalDatum; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.LinkedList; -import java.util.List; - public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - private final static Logger logger = Logger.getLogger(AdvancedRecalibrationEngine.class); - - final List> allThreadLocalQualityScoreTables = new LinkedList>(); - private ThreadLocal> threadLocalQualityScoreTables = new ThreadLocal>() { - @Override - protected synchronized NestedIntegerArray initialValue() { - final NestedIntegerArray table = recalibrationTables.makeQualityScoreTable(); - allThreadLocalQualityScoreTables.add(table); - return table; - } - }; - @Override public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { final GATKSAMRecord read = recalInfo.getRead(); final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final NestedIntegerArray qualityScoreTable = getThreadLocalQualityScoreTable(); + final RecalibrationTables tables = getRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( ! 
recalInfo.skip(offset) ) { @@ -70,30 +56,10 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } } } - - /** - * Get a NestedIntegerArray for a QualityScore table specific to this thread - * @return a non-null NestedIntegerArray ready to be used to collect calibration info for the quality score covariate - */ - private NestedIntegerArray getThreadLocalQualityScoreTable() { - return threadLocalQualityScoreTables.get(); - } - - @Override - public void finalizeData() { - // merge in all of the thread local tables - logger.info("Merging " + allThreadLocalQualityScoreTables.size() + " thread-local quality score tables"); - for ( final NestedIntegerArray localTable : allThreadLocalQualityScoreTables ) { - recalibrationTables.combineQualityScoreTable(localTable); - } - allThreadLocalQualityScoreTables.clear(); // cleanup after ourselves - - super.finalizeData(); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 7692c58e2..ffcfd6233 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -120,9 +120,16 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 
30 is perhaps better for whole genome call sets", required = false) public double BAQGOP = BAQ.DEFAULT_GOP; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + /** + * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag + * to safely access only one table. There may be some CPU cost, but as long as the table is really big + * there should be relatively little CPU costs. + */ + @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) + public boolean lowMemoryMode = false; - private RecalibrationTables recalibrationTables; + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) @@ -130,8 +137,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private int minimumQToUse; - protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. 
Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector @@ -143,7 +148,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * Based on the covariates' estimates for initial capacity allocate the data hashmap */ public void initialize() { - baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty // check for unsupported access @@ -188,10 +192,11 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, recalibrationTables); + recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); + if ( lowMemoryMode ) + recalibrationEngine.enableLowMemoryMode(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); @@ -501,14 +506,18 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche logger.info("Processed: " + result + " reads"); } + private RecalibrationTables getRecalibrationTable() { + return recalibrationEngine.getFinalRecalibrationTables(); + } + private void generatePlots() { File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates); } else - 
RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates); } /** @@ -517,10 +526,10 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * generate a quantization map (recalibrated_qual -> quantized_qual) */ private void quantizeQualityScores() { - quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); + quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); } private void generateReport() { - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 5c002b7e5..6c3189be5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; + /* * Copyright (c) 2009 The Broad Institute * @@ -40,9 +42,10 @@ public interface RecalibrationEngine { * The engine should collect match and mismatch data into the recalibrationTables data. 
* * @param covariates an array of the covariates we'll be using in this engine, order matters - * @param recalibrationTables the destination recalibrationTables where stats should be collected + * @param numReadGroups the number of read groups we should use for the recalibration tables + * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); /** * Update the recalibration statistics using the information in recalInfo @@ -57,4 +60,8 @@ public interface RecalibrationEngine { * Called once after all calls to updateDataForRead have been issued. */ public void finalizeData(); + + public void enableLowMemoryMode(); + + public RecalibrationTables getFinalRecalibrationTables(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a6ab98e8b..0cd042eeb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -25,26 +25,64 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. 
*/ +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.PublicPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { + private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); protected Covariate[] covariates; - protected RecalibrationTables recalibrationTables; + private int numReadGroups; + private PrintStream maybeLogStream; + private boolean lowMemoryMode = false; + + private boolean finalized = false; + private RecalibrationTables mergedRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + logger.info("Creating RecalibrationTable for " + Thread.currentThread()); + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? 
makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + protected RecalibrationTables getRecalibrationTables() { + return threadLocalTables.get(); + } + + public void enableLowMemoryMode() { + this.lowMemoryMode = true; + } @Override - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( recalibrationTables == null ) throw new IllegalArgumentException("recalibrationTables cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); this.covariates = covariates.clone(); - this.recalibrationTables = recalibrationTables; + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; } @Override @@ -59,13 +97,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final double isError = recalInfo.getErrorFraction(eventType, offset); final int[] keys = readCovariates.getKeySet(offset, eventType); - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); } } } @@ -90,8 +128,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP */ @Override public void finalizeData() { - final 
NestedIntegerArray byReadGroupTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = recalibrationTables.getQualityScoreTable(); + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); + + // merge all of the thread-local tables + mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); + + final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); // iterate over all values in the qual table for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { @@ -108,6 +151,38 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP rgDatum.combine(qualDatum); } } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! 
finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return mergedRecalibrationTables; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 3f968d7f6..a6b1e13b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -123,12 +123,16 @@ public final class RecalibrationTables { } /** - * Merge in the quality score table information from qualityScoreTable into this - * recalibration table's quality score table. - * - * @param qualityScoreTable the quality score table we want to merge in + * Merge all of the tables from toMerge into into this set of tables */ - public void combineQualityScoreTable(final NestedIntegerArray qualityScoreTable) { - RecalUtils.combineTables(getQualityScoreTable(), qualityScoreTable); + public void combine(final RecalibrationTables toMerge) { + if ( numTables() != toMerge.numTables() ) + throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); + + for ( int i = 0; i < numTables(); i++ ) { + final NestedIntegerArray myTable = this.getTable(i); + final NestedIntegerArray otherTable = toMerge.getTable(i); + RecalUtils.combineTables(myTable, otherTable); + } } } From bbdf9ee91b895d353260f2b5219d1b3d6c3c3ce4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 16:47:45 -0500 Subject: [PATCH 12/30] BQSR cleanup: merge Advanced and Standard recalibration engine into just the RecalibrationEngine -- As we are no longer maintaining a public/protected system we need only have one RecalibrationEngine. -- Misc. 
code cleanup and docs along the way --- .../bqsr/AdvancedRecalibrationEngine.java | 65 ----- .../gatk/walkers/bqsr/BaseRecalibrator.java | 51 ++-- .../walkers/bqsr/RecalibrationEngine.java | 254 +++++++++++++++--- .../bqsr/StandardRecalibrationEngine.java | 219 --------------- 4 files changed, 249 insertions(+), 340 deletions(-) delete mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java deleted file mode 100644 index 3871101eb..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final RecalibrationTables tables = getRecalibrationTables(); - final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! 
recalInfo.skip(offset) ) { - - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - - incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); - } - } - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index ffcfd6233..2c774d94d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -113,12 +113,11 @@ import java.util.List; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @PartitionBy(PartitionType.READ) public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { + /** + * all the command line arguments for BQSR and it's covariates + */ @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates - - @Advanced - @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 
30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); /** * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency @@ -129,9 +128,19 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) public boolean lowMemoryMode = false; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; - private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) + /** + * an object that keeps track of the information necessary for quality score quantization + */ + private QuantizationInfo quantizationInfo; + + /** + * list to hold the all the covariate objects that were requested (required + standard + experimental) + */ + private Covariate[] requestedCovariates; private RecalibrationEngine recalibrationEngine; @@ -189,30 +198,20 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); } - int numReadGroups = 0; - for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) - numReadGroups += header.getReadGroups().size(); - - recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); - if ( 
lowMemoryMode ) - recalibrationEngine.enableLowMemoryMode(); - + initializeRecalibrationEngine(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); } - private RecalibrationEngine initializeRecalibrationEngine() { + /** + * Initialize the recalibration engine + */ + private void initializeRecalibrationEngine() { + int numReadGroups = 0; + for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) + numReadGroups += header.getReadGroups().size(); - final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class); - try { - final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); - constructor.setAccessible(true); - return (RecalibrationEngine)constructor.newInstance(); - } - catch (Exception e) { - throw new ReviewedStingException("Unable to create RecalibrationEngine class instance " + recalibrationEngineClass.getSimpleName()); - } + recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); } private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 6c3189be5..c6d5cddb9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -1,37 +1,90 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, 
distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + +public class RecalibrationEngine { + final protected Covariate[] covariates; + final private int numReadGroups; + final private PrintStream maybeLogStream; + final private boolean lowMemoryMode; + + /** + * Has finalizeData() been called? + */ + private boolean finalized = false; + + /** + * The final (merged, etc) recalibration tables, suitable for downstream analysis. 
+ */ + private RecalibrationTables finalRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + /** + * Get a recalibration table suitable for updating the underlying RecalDatums + * + * May return a thread-local version, or a single version, depending on the initialization + * arguments of this instance. + * + * @return + */ + protected RecalibrationTables getUpdatableRecalibrationTables() { + return threadLocalTables.get(); + } -/* -* Copyright (c) 2009 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE. -*/ -public interface RecalibrationEngine { /** * Initialize the recalibration engine * @@ -45,23 +98,164 @@ public interface RecalibrationEngine { * @param numReadGroups the number of read groups we should use for the recalibration tables * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); + public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) { + if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); + + this.covariates = covariates.clone(); + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; + this.lowMemoryMode = enableLowMemoryMode; + } /** * Update the recalibration statistics using the information in recalInfo * @param recalInfo data structure holding information about the recalibration values for a single read */ @Requires("recalInfo != null") - public void updateDataForRead(final ReadRecalibrationInfo recalInfo); + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + final RecalibrationTables tables = getUpdatableRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); + + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( ! 
recalInfo.skip(offset) ) { + + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); + + incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } + } + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } /** * Finalize, if appropriate, all derived data in recalibrationTables. * * Called once after all calls to updateDataForRead have been issued. + * + * Assumes that all of the principal tables (by quality score) have been completely updated, + * and walks over this data to create summary data tables like by read group table. 
*/ - public void finalizeData(); + public void finalizeData() { + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - public void enableLowMemoryMode(); + // merge all of the thread-local tables + finalRecalibrationTables = mergeThreadLocalRecalibrationTables(); - public RecalibrationTables getFinalRecalibrationTables(); + final NestedIntegerArray byReadGroupTable = finalRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable(); + + // iterate over all values in the qual table + for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { + final int rgKey = leaf.keys[0]; + final int eventIndex = leaf.keys[2]; + final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); + final RecalDatum qualDatum = leaf.value; + + if ( rgDatum == null ) { + // create a copy of qualDatum, and initialize byReadGroup table with it + byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); + } else { + // combine the qual datum with the existing datum in the byReadGroup table + rgDatum.combine(qualDatum); + } + } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! 
finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + /** + * Get the final recalibration tables, after finalizeData() has been called + * + * This returns the finalized recalibration table collected by this engine. + * + * It is an error to call this function before finalizeData has been called + * + * @return the finalized recalibration table collected by this engine + */ + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return finalRecalibrationTables; + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! 
table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java deleted file mode 100644 index 0cd042eeb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ /dev/null @@ -1,219 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.classloader.PublicPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.LinkedList; -import java.util.List; - -public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { - private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); - protected Covariate[] covariates; - private int numReadGroups; - private PrintStream maybeLogStream; - private boolean lowMemoryMode = false; - - private boolean finalized = false; - private RecalibrationTables mergedRecalibrationTables = null; - - private final List recalibrationTablesList = new LinkedList(); - - private final ThreadLocal threadLocalTables = new ThreadLocal() { - private synchronized RecalibrationTables makeAndCaptureTable() { - logger.info("Creating RecalibrationTable for " + Thread.currentThread()); - final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); - recalibrationTablesList.add(newTable); - return newTable; - } - - @Override - protected synchronized RecalibrationTables initialValue() { - if ( lowMemoryMode ) { - return recalibrationTablesList.isEmpty() ? 
makeAndCaptureTable() : recalibrationTablesList.get(0); - } else { - return makeAndCaptureTable(); - } - } - }; - - protected RecalibrationTables getRecalibrationTables() { - return threadLocalTables.get(); - } - - public void enableLowMemoryMode() { - this.lowMemoryMode = true; - } - - @Override - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { - if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); - - this.covariates = covariates.clone(); - this.numReadGroups = numReadGroups; - this.maybeLogStream = maybeLogStream; - } - - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final EventType eventType = EventType.BASE_SUBSTITUTION; - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! 
recalInfo.skip(offset) ) { - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - final int[] keys = readCovariates.getKeySet(offset, eventType); - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); - } - } - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } - - /** - * Create derived recalibration data tables - * - * Assumes that all of the principal tables (by quality score) have been completely updated, - * and walks over this data to create summary data tables like by read group table. 
- */ - @Override - public void finalizeData() { - if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - - // merge all of the thread-local tables - mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); - - final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); - - // iterate over all values in the qual table - for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { - final int rgKey = leaf.keys[0]; - final int eventIndex = leaf.keys[2]; - final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); - final RecalDatum qualDatum = leaf.value; - - if ( rgDatum == null ) { - // create a copy of qualDatum, and initialize byReadGroup table with it - byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); - } else { - // combine the qual datum with the existing datum in the byReadGroup table - rgDatum.combine(qualDatum); - } - } - - finalized = true; - } - - /** - * Merge all of the thread local recalibration tables into a single one. - * - * Reuses one of the recalibration tables to hold the merged table, so this function can only be - * called once in the engine. - * - * @return the merged recalibration table - */ - @Requires("! finalized") - private RecalibrationTables mergeThreadLocalRecalibrationTables() { - if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); - - RecalibrationTables merged = null; - for ( final RecalibrationTables table : recalibrationTablesList ) { - if ( merged == null ) - // fast path -- if there's only only one table, so just make it the merged one - merged = table; - else { - merged.combine(table); - } - } - - return merged; - } - - public RecalibrationTables getFinalRecalibrationTables() { - if ( ! 
finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); - return mergedRecalibrationTables; - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); - } - } -} From a5901cdd2037833568fa13e38f74952e261cc80a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 10:54:02 -0500 Subject: [PATCH 14/30] Bugfix for printProgress in TraverseReadsNano -- Must provide a single bp position (1:10) not the range of the read (1:1-50). ProgressMeter now checks at runtime for this problem as well. 
--- .../sting/gatk/traversals/TraverseReadsNano.java | 3 ++- .../sting/utils/progressmeter/ProgressMeter.java | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index ee71d82bb..aa33def62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -65,7 +65,8 @@ public class TraverseReadsNano extends TraversalEngine, @Override public void progress(MapData lastProcessedMap) { if ( lastProcessedMap.refContext != null ) - printProgress(lastProcessedMap.refContext.getLocus()); + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); } }); } diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index c9d849227..b36326722 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -234,11 +234,13 @@ public class ProgressMeter { * the progress itself. A separate printing daemon periodically polls the meter to print out * progress * - * @param loc Current location, can be null if you are at the end of the processing unit + * @param loc Current location, can be null if you are at the end of the processing unit. Must + * have size == 1 (cannot be multiple bases in size). 
* @param nTotalRecordsProcessed the total number of records we've processed */ public synchronized void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); // weird comparison to ensure that loc == null (in unmapped reads) is keep before maxGenomeLoc == null (on startup) this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc)); From 810e2da1d47b973f93be2102de2205310ea98a37 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 11:38:58 -0500 Subject: [PATCH 15/30] Cleanup and unit tests for EventType and ReadRecalibrationInfo in BQSR -- Added unit tests for EventType and ReadRecalibrationInfo -- Simplified interface of EventType. Previously this enum carried an index with it, but this is redundant with the enum.ordinal function. Now just using that function instead. 
--- .../walkers/bqsr/ReadRecalibrationInfo.java | 7 +- .../walkers/bqsr/RecalibrationEngine.java | 2 +- .../recalibration/BaseRecalibration.java | 10 +- .../sting/utils/recalibration/EventType.java | 42 ++++--- .../utils/recalibration/ReadCovariates.java | 24 ++-- .../recalibration/RecalibrationReport.java | 6 +- .../bqsr/ReadRecalibrationInfoUnitTest.java | 110 ++++++++++++++++++ .../recalibration/EventTypeUnitTest.java | 61 ++++++++++ .../RecalibrationReportUnitTest.java | 8 +- 9 files changed, 216 insertions(+), 54 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java index 121e3449b..b884b89db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -58,7 +58,8 @@ public final class ReadRecalibrationInfo { if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null"); if ( skips == null ) throw new IllegalArgumentException("skips cannot be null"); if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null"); - // future: may allow insertionErrors && deletionErrors to be null, so don't enforce + if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null"); + if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null"); this.read = read; this.baseQuals = read.getBaseQualities(); @@ -73,8 +74,8 @@ public final class ReadRecalibrationInfo { if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != 
length " + length); if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length); - if ( insertionErrors != null && insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); - if ( deletionErrors != null && deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); + if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); + if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index c6d5cddb9..910519031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -124,7 +124,7 @@ public class RecalibrationEngine { for (final EventType eventType : EventType.values()) { final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; + final int eventIndex = eventType.ordinal(); final byte qual = recalInfo.getQual(eventType, offset); final double isError = recalInfo.getErrorFraction(eventType, offset); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 3e0f36799..43c9bd2b5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -198,7 
+198,7 @@ public class BaseRecalibration { } private double getGlobalDeltaQ(final int rgKey, final EventType errorModel) { - final Double cached = globalDeltaQs.get(rgKey, errorModel.index); + final Double cached = globalDeltaQs.get(rgKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateGlobalDeltaQ(rgKey, errorModel); @@ -210,7 +210,7 @@ public class BaseRecalibration { } private double getDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ) { - final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.index); + final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateDeltaQReported(rgKey, qualKey, errorModel, globalDeltaQ, (byte)qualKey); @@ -240,7 +240,7 @@ public class BaseRecalibration { private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.index); + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); if (empiricalQualRG != null) { final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); @@ -254,7 +254,7 @@ public class BaseRecalibration { private double calculateDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { double result = 0.0; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.index); + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.ordinal()); if (empiricalQualQS != null) { final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); result = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; @@ -287,7 +287,7 @@ public class BaseRecalibration { 
final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { - final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.index); + final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.ordinal()); if (empiricalQualCO != null) { final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); return deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java index 1c84518eb..63f873892 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java @@ -1,41 +1,39 @@ package org.broadinstitute.sting.utils.recalibration; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - public enum EventType { - BASE_SUBSTITUTION(0, "M", "Base Substitution"), - BASE_INSERTION(1, "I", "Base Insertion"), - BASE_DELETION(2, "D", "Base Deletion"); + BASE_SUBSTITUTION("M", "Base Substitution"), + BASE_INSERTION("I", "Base Insertion"), + BASE_DELETION("D", "Base Deletion"); - public final int index; private final String representation; private final String longRepresentation; - private EventType(int index, String representation, String longRepresentation) { - this.index = index; + private EventType(String representation, String longRepresentation) { this.representation = representation; this.longRepresentation = longRepresentation; } + /** + * Get the EventType corresponding to its ordinal index + * @param index an ordinal index + * @return the event type corresponding to ordinal index + */ public static EventType eventFrom(int index) { - switch (index) { - case 0: - return BASE_SUBSTITUTION; - case 1: - return BASE_INSERTION; - case 2: - return BASE_DELETION; - default: - throw new 
ReviewedStingException(String.format("Event %d does not exist.", index)); - } + return EventType.values()[index]; } - - public static EventType eventFrom(String event) { + + /** + * Get the EventType with short string representation + * @throws IllegalArgumentException if representation doesn't correspond to one of EventType + * @param representation short string representation of the event + * @return an EventType + */ + public static EventType eventFrom(String representation) { for (EventType eventType : EventType.values()) - if (eventType.representation.equals(event)) + if (eventType.representation.equals(representation)) return eventType; - throw new ReviewedStingException(String.format("Event %s does not exist.", event)); + throw new IllegalArgumentException(String.format("Event %s does not exist.", representation)); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 4ddcb2b92..405b2d143 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -68,9 +68,9 @@ public class ReadCovariates { * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates */ public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + 
keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; } /** @@ -81,11 +81,11 @@ public class ReadCovariates { * @return */ public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.index][readPosition]; + return keys[errorModel.ordinal()][readPosition]; } public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.index]; + return keys[errorModel.ordinal()]; } // ---------------------------------------------------------------------- @@ -94,17 +94,9 @@ public class ReadCovariates { // // ---------------------------------------------------------------------- - protected int[][] getMismatchesKeySet() { - return keys[EventType.BASE_SUBSTITUTION.index]; - } - - protected int[][] getInsertionsKeySet() { - return keys[EventType.BASE_INSERTION.index]; - } - - protected int[][] getDeletionsKeySet() { - return keys[EventType.BASE_DELETION.index]; - } + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } protected int[] getMismatchesKeySet(final int readPosition) { return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 6ecac1394..ff0890ff0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -142,7 +142,7 @@ public class RecalibrationReport { tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex].keyFromValue(covValue); final EventType event = 
EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempCOVarray[3] = event.index; + tempCOVarray[3] = event.ordinal(); recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); } @@ -161,7 +161,7 @@ public class RecalibrationReport { final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempQUALarray[2] = event.index; + tempQUALarray[2] = event.ordinal(); qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); } @@ -178,7 +178,7 @@ public class RecalibrationReport { final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempRGarray[1] = event.index; + tempRGarray[1] = event.ordinal(); rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java new file mode 100644 index 000000000..08a8f2dc1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit 
persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.List; + +public final class ReadRecalibrationInfoUnitTest extends BaseTest { + @DataProvider(name = "InfoProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + for ( final int readLength: Arrays.asList(10, 100, 1000) ) { + for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { + tests.add(new Object[]{readLength, includeIndelErrors}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "InfoProvider") + public void testReadInfo(final int readLength, final boolean includeIndelErrors) { + final ReadCovariates covariates = new ReadCovariates(readLength, 2); + + final 
byte[] bases = new byte[readLength]; + final byte[] baseQuals = new byte[readLength]; + final byte[] insertionQuals = new byte[readLength]; + final byte[] deletionQuals = new byte[readLength]; + final boolean[] skips = new boolean[readLength]; + final double[] snpErrors = new double[readLength]; + final double[] insertionErrors = new double[readLength]; + final double[] deletionsErrors = new double[readLength]; + for ( int i = 0; i < readLength; i++ ) { + bases[i] = 'A'; + baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); + insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); + deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); + skips[i] = i % 2 == 0; + snpErrors[i] = 1.0 / (i+1); + insertionErrors[i] = 0.5 / (i+1); + deletionsErrors[i] = 0.3 / (i+1); + } + + final EnumMap errors = new EnumMap(EventType.class); + errors.put(EventType.BASE_SUBSTITUTION, snpErrors); + errors.put(EventType.BASE_INSERTION, insertionErrors); + errors.put(EventType.BASE_DELETION, deletionsErrors); + + final EnumMap quals = new EnumMap(EventType.class); + quals.put(EventType.BASE_SUBSTITUTION, baseQuals); + quals.put(EventType.BASE_INSERTION, insertionQuals); + quals.put(EventType.BASE_DELETION, deletionQuals); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); + if ( includeIndelErrors ) { + read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); + read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); + } + + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); + + Assert.assertEquals(info.getCovariatesValues(), covariates); + Assert.assertEquals(info.getRead(), read); + + for ( int i = 0; i < readLength; i++ ) { + Assert.assertEquals(info.skip(i), skips[i]); + for ( final EventType et : EventType.values() ) { + Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); + final byte expectedQual = et == 
EventType.BASE_SUBSTITUTION || includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; + Assert.assertEquals(info.getQual(et, i), expectedQual); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java new file mode 100644 index 000000000..53645e224 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; + +public final class EventTypeUnitTest extends BaseTest { + @Test + public void testEventTypes() { + for ( final EventType et : EventType.values() ) { + Assert.assertNotNull(et.toString()); + Assert.assertNotNull(et.prettyPrint()); + Assert.assertFalse("".equals(et.toString())); + Assert.assertFalse("".equals(et.prettyPrint())); + Assert.assertEquals(EventType.eventFrom(et.ordinal()), et); + Assert.assertEquals(EventType.eventFrom(et.toString()), et); + } + } + + @Test + public void testEventTypesEnumItself() { + final Set shortReps = new HashSet(); + for ( final EventType et : EventType.values() ) { + Assert.assertFalse(shortReps.contains(et.toString()), "Short representative for EventType has duplicates for " + et); + shortReps.add(et.toString()); + } + Assert.assertEquals(shortReps.size(), EventType.values().length, "Short representatives for EventType aren't unique"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadString() { + EventType.eventFrom("asdfhalsdjfalkjsdf"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 9707ed078..aa0419fed 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -20,7 +20,7 @@ public class RecalibrationReportUnitTest { private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); + final int nErrors = 
Math.min(random.nextInt(maxErrors), nObservations); final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); return new RecalDatum(nObservations, nErrors, (byte)qual); } @@ -90,14 +90,14 @@ public class RecalibrationReportUnitTest { final int[] covariates = rc.getKeySet(offset, errorMode); final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j]; if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.index); + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); nKeys++; } } From fe06912a87c2a55f3f22b5760855afc9d4987afe Mon Sep 17 00:00:00 2001 From: Tad Jordan Date: Fri, 4 Jan 2013 11:52:04 -0500 Subject: [PATCH 16/30] Removed sorting by row from walkers --- .../sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java | 2 +- .../gatk/walkers/varianteval/VariantEvalReportWriter.java | 2 +- .../sting/utils/recalibration/QuantizationInfo.java | 4 ++-- .../broadinstitute/sting/gatk/report/GATKReportUnitTest.java | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java 
b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index 5972322f8..98e581e21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -124,7 +124,7 @@ public class ErrorRatePerCycle extends LocusWalker { public void initialize() { report = new GATKReport(); - report.addTable(reportName, reportDescription, 6, GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(reportName, reportDescription, 6); table = report.getTable(reportName); table.addColumn("readgroup"); table.addColumn("cycle"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 6dad128fe..91efd1ffd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -162,7 +162,7 @@ public class VariantEvalReportWriter { // create the table final String tableName = ve.getSimpleName(); final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); - report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 
2 : datamap.size())); // grab the table, and add the columns we need to it final GATKReportTable table = report.getTable(tableName); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index fc942499c..5cf16dc9f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -67,9 +67,9 @@ public class QuantizationInfo { return quantizationLevels; } - public GATKReportTable generateReportTable(boolean sortBycols) { + public GATKReportTable generateReportTable(boolean sortByCols) { GATKReportTable quantizedTable; - if(sortBycols) { + if(sortByCols) { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index 40d8d8ff9..0637e9a25 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -105,7 +105,7 @@ public class GATKReportUnitTest extends BaseTest { private boolean isSorted(GATKReportTable table) { boolean result = true; - File testingSortingTableFile = new File("myFile.txt"); + File testingSortingTableFile = new File("testSortingFile.txt"); try { // Connect print stream to the output stream From ab5526b3729cf481b96f1ca69cc399a1c396ca75 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 4 Jan 2013 13:24:57 -0500 Subject: [PATCH 18/30] More TODOs --- 
.../sting/gatk/traversals/TraverseActiveRegionsUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 5051bc35f..0ec4f57f6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -126,10 +126,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // TODO: reads which span many regions // TODO: reads which are partially between intervals (in/outside extension) // TODO: duplicate reads - + // TODO: read at the end of a contig // TODO: reads which are completely outside intervals but within extension // TODO: test the extension itself - + // TODO: unmapped reads intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20)); From dd7f5e2be749b9373cb9ed7dd30b3c98ed34e9b0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Fri, 4 Jan 2013 14:43:11 -0500 Subject: [PATCH 19/30] Hooking up the Bayesian estimate code for calculating Qemp in BQSR; various fixes after adding unit tests. 
--- .../walkers/bqsr/BQSRIntegrationTest.java | 32 ++--- .../bqsr/StandardRecalibrationEngine.java | 6 +- .../sting/utils/recalibration/RecalDatum.java | 112 ++++++++++-------- .../utils/recalibration/RecalDatumNode.java | 4 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../recalibration/RecalibrationReport.java | 24 ++-- .../recalibration/RecalDatumUnitTest.java | 106 +++++++++++++++-- .../recalibration/RecalUtilsUnitTest.java | 2 +- .../RecalibrationReportUnitTest.java | 6 +- 9 files changed, 199 insertions(+), 95 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java index 177a989fb..9b6d6f913 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRIntegrationTest.java @@ -53,21 +53,21 @@ public class BQSRIntegrationTest extends WalkerTest { String HiSeqBam = privateTestDir + "HiSeq.1mb.1RG.bam"; String HiSeqInterval = "chr1:10,000,000-10,100,000"; return new Object[][]{ - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "2f250fecb930e0dfe0f63fe0fed3960b")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "26c8d7226139a040557b1d3b1c8792f0")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "9b43a1839cb6ea03aec1d96f15ca8efb")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "3159a9d136c45e4a65d46a23dc8fd3b5")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "bb7262829effbbdbc8d88dd36f480368")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "fbb002fa2b9197c4b555852dccc11562")}, - {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", 
"7392acb71131a60a527ca32715fc59be")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "49d4383896a90795d94138db1410a7df")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "427448eff98cf194cc7217c0b1401e79")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "50cd1a10b6ecb3d09f90f1e4a66da95d")}, - {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "1dc71561c9d0fb56f9876cb5043c5376")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "13e8f032e76340b114847c90af0a1f8a")}, - {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "03f58ae4f9d203034e895a3636fc108f")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "49d4383896a90795d94138db1410a7df")}, - {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + "bqsrKnownTest.bed", "2db2ef8c2d63e167663d70340182f49a")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, "", "6b3f252718f59cf9fd3f7612f73a35bf")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov ContextCovariate", "863576ac9ff0b0e02f2e84aef15923a7")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --no_standard_covs -cov CycleCovariate", "03e28f48201a35c70d1cf48e9f45364f")}, + {new 
BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --indels_context_size 4", "6e3c5635d387a1c428a7c9c88ad26488")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --low_quality_tail 5", "6507adcb94bacde4cdee9caa9f14f24b")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --quantizing_levels 6", "399bbb4bf80764dfc644b2f95d824615")}, + {new BQSRTest(hg18Reference, HiSeqBam, HiSeqInterval, " --mismatches_context_size 4", "34d70899253c2b3343ca9ae944291c30")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", "", "e61fa47bfc08433f0cd55558e2081548")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-10,200,000", "", "5c2622c63225b8b04990baf0ae4de07c")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12873.454.SRP000031.2009_06.chr1.10_20mb.1RG.bam", "1:10,000,000-10,200,000", "", "ee7191d83d7d5bb957dc4595883c32f1")}, + {new BQSRTest(b36KGReference, validationDataLocation + "originalQuals.1kg.chr1.1-1K.1RG.bam", "1:1-1,000", " -OQ", "da92f4730356f479c2c2b71497cfac6d")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA19240.chr1.BFAST.SOLID.bam", "1:10,000,000-20,000,000", " --solid_recal_mode REMOVE_REF_BIAS", "8075595113b48c0c7ead08ce41bef9fe")}, + {new BQSRTest(b36KGReference, privateTestDir + "NA19240.chr1.BFAST.SOLID.hasCSNoCall.bam", "1:50,000-80,000", " --solid_nocall_strategy LEAVE_READ_UNRECALIBRATED", "be05834841c5690c66910270521d5c32")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:anyNameABCD,VCF " + privateTestDir + "vcfexample3.vcf", "e61fa47bfc08433f0cd55558e2081548")}, + {new BQSRTest(b36KGReference, validationDataLocation + "NA12892.SLX.SRP000031.2009_06.selected.1Mb.1RG.bam", "1:10,000,000-10,200,000", " -knownSites:bed " + validationDataLocation + 
"bqsrKnownTest.bed", "8ee0b498dbbc95ce76393a0f089fec92")}, }; } @@ -104,7 +104,7 @@ public class BQSRIntegrationTest extends WalkerTest { " -sortAllCols" + " --plot_pdf_file /dev/null" + " --intermediate_csv_file %s", - Arrays.asList("d1c38a3418979400630e2bca1140689c")); + Arrays.asList("dd6e0e1e3f53f8ae0c8f5de21ded6ee9")); executeTest("testBQSR-CSVfile", spec); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a6ab98e8b..4ad8ccdf3 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -79,7 +79,7 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP * @return a new RecalDatum object with the observation and the error */ protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); + return new RecalDatum(1L, isError, reportedQual); } /** @@ -133,12 +133,12 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP if ( ! table.put(createDatumObject(qual, isError), keys) ) { // Failed to put a new item because another thread came along and put an item here first. 
// Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); + table.get(keys).increment(1, isError); } } else { // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); + existingDatum.increment(1, isError); } } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 9b58a5900..4eb069542 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -68,7 +68,7 @@ public class RecalDatum { /** * number of bases seen in total */ - private double numObservations; + private long numObservations; /** * number of bases seen that didn't match the reference @@ -89,13 +89,13 @@ public class RecalDatum { /** * Create a new RecalDatum with given observation and mismatch counts, and an reported quality * - * @param _numObservations - * @param _numMismatches - * @param reportedQuality + * @param _numObservations observations + * @param _numMismatches mismatches + * @param reportedQuality Qreported */ - public RecalDatum(final double _numObservations, final double _numMismatches, final byte reportedQuality) { + public RecalDatum(final long _numObservations, final double _numMismatches, final byte reportedQuality) { if ( _numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); - if ( _numMismatches < 0 ) throw new IllegalArgumentException("numMismatches < 0"); + if ( _numMismatches < 0.0 ) throw new IllegalArgumentException("numMismatches < 0"); if ( reportedQuality < 0 ) throw new IllegalArgumentException("reportedQuality < 0"); numObservations = _numObservations; @@ -106,7 +106,7 @@ public class RecalDatum { /** * Copy copy into this recal datum, overwriting all of this objects data - * @param copy + * @param copy 
RecalDatum to copy */ public RecalDatum(final RecalDatum copy) { this.numObservations = copy.getNumObservations(); @@ -119,7 +119,7 @@ public class RecalDatum { * Add in all of the data from other into this object, updating the reported quality from the expected * error rate implied by the two reported qualities * - * @param other + * @param other RecalDatum to combine */ public synchronized void combine(final RecalDatum other) { final double sumErrors = this.calcExpectedErrors() + other.calcExpectedErrors(); @@ -192,7 +192,7 @@ public class RecalDatum { @Override public String toString() { - return String.format("%.2f,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); + return String.format("%d,%.2f,%.2f", getNumObservations(), getNumMismatches(), getEmpiricalQuality()); } public String stringForCSV() { @@ -205,11 +205,11 @@ public class RecalDatum { // //--------------------------------------------------------------------------------------------------------------- - public final double getNumObservations() { + public final long getNumObservations() { return numObservations; } - public final synchronized void setNumObservations(final double numObservations) { + public final synchronized void setNumObservations(final long numObservations) { if ( numObservations < 0 ) throw new IllegalArgumentException("numObservations < 0"); this.numObservations = numObservations; empiricalQuality = UNINITIALIZED; @@ -227,7 +227,7 @@ public class RecalDatum { } @Requires({"by >= 0"}) - public final synchronized void incrementNumObservations(final double by) { + public final synchronized void incrementNumObservations(final long by) { numObservations += by; empiricalQuality = UNINITIALIZED; } @@ -240,7 +240,7 @@ public class RecalDatum { @Requires({"incObservations >= 0", "incMismatches >= 0"}) @Ensures({"numObservations == old(numObservations) + incObservations", "numMismatches == old(numMismatches) + incMismatches"}) - public final synchronized void 
increment(final double incObservations, final double incMismatches) { + public final synchronized void increment(final long incObservations, final double incMismatches) { numObservations += incObservations; numMismatches += incMismatches; empiricalQuality = UNINITIALIZED; @@ -248,7 +248,7 @@ public class RecalDatum { @Ensures({"numObservations == old(numObservations) + 1", "numMismatches >= old(numMismatches)"}) public final synchronized void increment(final boolean isError) { - increment(1, isError ? 1 : 0.0); + increment(1, isError ? 1.0 : 0.0); } // ------------------------------------------------------------------------------------- @@ -257,19 +257,6 @@ public class RecalDatum { // // ------------------------------------------------------------------------------------- - /** - * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) - */ - @Requires("empiricalQuality == UNINITIALIZED") - @Ensures("empiricalQuality != UNINITIALIZED") - private synchronized void calcEmpiricalQuality() { - - // TODO -- add code for Bayesian estimate of Qemp here - - final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); - empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); - } - /** * calculate the expected number of errors given the estimated Q reported and the number of observations * in this datum. 
@@ -281,10 +268,29 @@ public class RecalDatum { return getNumObservations() * QualityUtils.qualToErrorProb(estimatedQReported); } - static final boolean DEBUG = false; - static final double RESOLUTION_BINS_PER_QUAL = 1.0; + /** + * Calculate and cache the empirical quality score from mismatches and observations (expensive operation) + */ + @Requires("empiricalQuality == UNINITIALIZED") + @Ensures("empiricalQuality != UNINITIALIZED") + private synchronized void calcEmpiricalQuality() { - static public double bayesianEstimateOfEmpiricalQuality(final double nObservations, final double nErrors, final double QReported) { + // smoothing is one error and one non-error observation + final long mismatches = (long)(getNumMismatches() + 0.5) + SMOOTHING_CONSTANT; + final long observations = getNumObservations() + SMOOTHING_CONSTANT + SMOOTHING_CONSTANT; + + final double empiricalQual = RecalDatum.bayesianEstimateOfEmpiricalQuality(observations, mismatches, getEstimatedQReported()); + + // This is the old and busted point estimate approach: + //final double empiricalQual = -10 * Math.log10(getEmpiricalErrorRate()); + + empiricalQuality = Math.min(empiricalQual, (double) QualityUtils.MAX_RECALIBRATED_Q_SCORE); + } + + //static final boolean DEBUG = false; + static private final double RESOLUTION_BINS_PER_QUAL = 1.0; + + static public double bayesianEstimateOfEmpiricalQuality(final long nObservations, final long nErrors, final double QReported) { final int numBins = (QualityUtils.MAX_REASONABLE_Q_SCORE + 1) * (int)RESOLUTION_BINS_PER_QUAL; @@ -294,14 +300,14 @@ public class RecalDatum { final double QEmpOfBin = bin / RESOLUTION_BINS_PER_QUAL; - log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10Likelihood(QEmpOfBin, nObservations, nErrors); + log10Posteriors[bin] = log10QempPrior(QEmpOfBin, QReported) + log10QempLikelihood(QEmpOfBin, nObservations, nErrors); - if ( DEBUG ) - System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = 
%f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); + //if ( DEBUG ) + // System.out.println(String.format("bin = %d, Qreported = %f, nObservations = %f, nErrors = %f, posteriors = %f", bin, QReported, nObservations, nErrors, log10Posteriors[bin])); } - if ( DEBUG ) - System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); + //if ( DEBUG ) + // System.out.println(String.format("Qreported = %f, nObservations = %f, nErrors = %f", QReported, nObservations, nErrors)); final double[] normalizedPosteriors = MathUtils.normalizeFromLog10(log10Posteriors); final int MLEbin = MathUtils.maxElementIndex(normalizedPosteriors); @@ -310,35 +316,47 @@ public class RecalDatum { return Qemp; } - static final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; + static private final double[] log10QempPriorCache = new double[QualityUtils.MAX_GATK_USABLE_Q_SCORE + 1]; static { // f(x) = a + b*exp(-((x - c)^2 / (2*d^2))) // Note that b is the height of the curve's peak, c is the position of the center of the peak, and d controls the width of the "bell". 
final double GF_a = 0.0; final double GF_b = 0.9; final double GF_c = 0.0; - final double GF_d = 0.5; + final double GF_d = 0.5; // with these parameters, deltas can shift at most ~20 Q points final GaussianFunction gaussian = new GaussianFunction(GF_a, GF_b, GF_c, GF_d); - for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) - log10QempPriorCache[i] = Math.log10(gaussian.value((double) i)); + for ( int i = 0; i <= QualityUtils.MAX_GATK_USABLE_Q_SCORE; i++ ) { + double log10Prior = Math.log10(gaussian.value((double) i)); + if ( Double.isInfinite(log10Prior) ) + log10Prior = -Double.MAX_VALUE; + log10QempPriorCache[i] = log10Prior; + } } - static public double log10QempPrior(final double Qempirical, final double Qreported) { + static protected double log10QempPrior(final double Qempirical, final double Qreported) { final int difference = Math.min(Math.abs((int) (Qempirical - Qreported)), QualityUtils.MAX_GATK_USABLE_Q_SCORE); - if ( DEBUG ) - System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Priors = %f", Qempirical, log10QempPriorCache[difference])); return log10QempPriorCache[difference]; } - static public double log10Likelihood(final double Qempirical, final double nObservations, final double nErrors) { + static protected double log10QempLikelihood(final double Qempirical, final long nObservations, final long nErrors) { + if ( nObservations == 0 ) + return 0.0; + // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); - if ( log10Prob == Double.NEGATIVE_INFINITY ) + double log10Prob = MathUtils.log10BinomialProbability(longToInt(nObservations), longToInt(nErrors), QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) 
log10Prob = -Double.MAX_VALUE; - if ( DEBUG ) - System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + //if ( DEBUG ) + // System.out.println(String.format("Qemp = %f, log10Likelihood = %f", Qempirical, log10Prob)); + return log10Prob; } + + static protected int longToInt(final long l) { + return (l > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)l; + } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java index 6c94c3c42..b8f89ad66 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatumNode.java @@ -264,7 +264,7 @@ public class RecalDatumNode { for ( final RecalDatumNode subnode : subnodes ) { // use the yates correction to help avoid all zeros => NaN counts[i][0] = Math.round(subnode.getRecalDatum().getNumMismatches()) + 1L; - counts[i][1] = Math.round(subnode.getRecalDatum().getNumObservations()) + 2L; + counts[i][1] = subnode.getRecalDatum().getNumObservations() + 2L; i++; } @@ -320,7 +320,7 @@ public class RecalDatumNode { if ( isLeaf() ) { // this is leave node - return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * recalDatum.getNumObservations(); + return (Math.abs(Math.log10(recalDatum.getEmpiricalErrorRate()) - Math.log10(globalErrorRate))) * (double)recalDatum.getNumObservations(); // TODO -- how we can generalize this calculation? 
// if ( this.qEnd <= minInterestingQual ) // // It's free to merge up quality scores below the smallest interesting one diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index d4e781fdd..9a9a5dfc6 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -93,7 +93,7 @@ public class RecalUtils { private static final Pair eventType = new Pair(RecalUtils.EVENT_TYPE_COLUMN_NAME, "%s"); private static final Pair empiricalQuality = new Pair(RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME, "%.4f"); private static final Pair estimatedQReported = new Pair(RecalUtils.ESTIMATED_Q_REPORTED_COLUMN_NAME, "%.4f"); - private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%.2f"); + private static final Pair nObservations = new Pair(RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME, "%d"); private static final Pair nErrors = new Pair(RecalUtils.NUMBER_ERRORS_COLUMN_NAME, "%.2f"); /** diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 6ecac1394..ac451221e 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -68,15 +68,6 @@ public class RecalibrationReport { } - protected RecalibrationReport(final QuantizationInfo quantizationInfo, final RecalibrationTables recalibrationTables, final Covariate[] requestedCovariates, final GATKReportTable argumentTable, final RecalibrationArgumentCollection RAC) { - this.quantizationInfo = quantizationInfo; - this.recalibrationTables = recalibrationTables; - this.requestedCovariates = requestedCovariates; - 
this.argumentTable = argumentTable; - this.RAC = RAC; - this.optionalCovariateIndexes = null; - } - /** * Counts the number of unique read groups in the table * @@ -192,11 +183,22 @@ public class RecalibrationReport { else if ( o instanceof Long ) return (Long)o; else - throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but its not either: " + o.getClass()); + throw new ReviewedStingException("Object " + o + " is expected to be either a double, long or integer but it's not either: " + o.getClass()); + } + + private long asLong(final Object o) { + if ( o instanceof Long ) + return (Long)o; + else if ( o instanceof Integer ) + return ((Integer)o).longValue(); + else if ( o instanceof Double ) + return ((Double)o).longValue(); + else + throw new ReviewedStingException("Object " + o + " is expected to be a long but it's not: " + o.getClass()); } private RecalDatum getRecalDatum(final GATKReportTable reportTable, final int row, final boolean hasEstimatedQReportedColumn) { - final double nObservations = asDouble(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); + final long nObservations = asLong(reportTable.get(row, RecalUtils.NUMBER_OBSERVATIONS_COLUMN_NAME)); final double nErrors = asDouble(reportTable.get(row, RecalUtils.NUMBER_ERRORS_COLUMN_NAME)); final double empiricalQuality = asDouble(reportTable.get(row, RecalUtils.EMPIRICAL_QUALITY_COLUMN_NAME)); diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 0f0c74362..2305fe566 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -30,16 +30,13 @@ package org.broadinstitute.sting.utils.recalibration; import org.broadinstitute.sting.BaseTest; +import 
org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.Utils; import org.testng.Assert; -import org.testng.annotations.BeforeSuite; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; -import java.util.ArrayList; import java.util.Arrays; -import java.util.List; public class RecalDatumUnitTest extends BaseTest { @@ -74,7 +71,7 @@ public class RecalDatumUnitTest extends BaseTest { } public RecalDatum makeRecalDatum() { - return new RecalDatum(exTotal, exError, (byte)getReportedQual()); + return new RecalDatum((long)exTotal, (double)exError, (byte)getReportedQual()); } @Override @@ -83,13 +80,19 @@ public class RecalDatumUnitTest extends BaseTest { } } + private static boolean createdDatumTestProviders = false; + @DataProvider(name = "RecalDatumTestProvider") public Object[][] makeRecalDatumTestProvider() { - for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) - for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) - for ( int reportedQual : Arrays.asList(10, 20) ) - if ( E <= N ) - new RecalDatumTestProvider(E, N, reportedQual); + if ( !createdDatumTestProviders ) { + for ( int E : Arrays.asList(1, 10, 100, 1000, 10000) ) + for ( int N : Arrays.asList(10, 100, 1000, 10000, 100000, 1000000) ) + for ( int reportedQual : Arrays.asList(10, 20) ) + if ( E <= N ) + new RecalDatumTestProvider(E, N, reportedQual); + createdDatumTestProviders = true; + } + return RecalDatumTestProvider.getTests(RecalDatumTestProvider.class); } @@ -104,7 +107,6 @@ public class RecalDatumUnitTest extends BaseTest { Assert.assertEquals(datum.getNumObservations(), cfg.exTotal, 1E-6); if ( cfg.getReportedQual() != -1 ) Assert.assertEquals(datum.getEstimatedQReportedAsByte(), cfg.getReportedQual()); - BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalQuality(), cfg.getErrorRatePhredScaled()); BaseTest.assertEqualsDoubleSmart(datum.getEmpiricalErrorRate(), cfg.getErrorRate()); 
final double e = datum.getEmpiricalQuality(); @@ -175,7 +177,87 @@ public class RecalDatumUnitTest extends BaseTest { @Test public void testNoObs() { - final RecalDatum rd = new RecalDatum(0, 0, (byte)10); + final RecalDatum rd = new RecalDatum(0L, 0.0, (byte)10); Assert.assertEquals(rd.getEmpiricalErrorRate(), 0.0); } + + @Test + public void testlog10QempPrior() { + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) { + for ( int Qrep = 0; Qrep <= QualityUtils.MAX_QUAL_SCORE; Qrep++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + Assert.assertTrue(log10prior < 0.0); + Assert.assertFalse(Double.isInfinite(log10prior)); + Assert.assertFalse(Double.isNaN(log10prior)); + } + } + + final int Qrep = 20; + int maxQemp = -1; + double maxQempValue = -Double.MAX_VALUE; + for ( int Qemp = 0; Qemp <= QualityUtils.MAX_QUAL_SCORE; Qemp++ ) { + final double log10prior = RecalDatum.log10QempPrior(Qemp, Qrep); + if ( log10prior > maxQempValue ) { + maxQemp = Qemp; + maxQempValue = log10prior; + } + } + Assert.assertEquals(maxQemp, Qrep); + } + + @Test + public void testlog10QempLikelihood() { + + final int Qrep = 20; + + // test no shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(0, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 0, Qrep), (double)Qrep); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 10, Qrep), (double)Qrep); + + // test small shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10, 10, Qrep), Qrep - 1.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000, 0, Qrep), Qrep + 1.0); + + // test medium shift + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 0, Qrep), Qrep + 3.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(10000, 10, Qrep), Qrep + 3.0); + + // test large shift + 
Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(100000, 10, Qrep), Qrep + 8.0); + Assert.assertEquals(RecalDatum.bayesianEstimateOfEmpiricalQuality(1000000, 10, Qrep), Qrep + 16.0); + } + + @Test + public void testBayesianEstimateOfEmpiricalQuality() { + + final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; + final int[] observations = new int[] {0, 10, 1000, 1000000}; + final int[] errors = new int[] {0, 10, 1000, 1000000}; + + for ( double Qemp : Qemps ) { + for ( int observation : observations ) { + for ( int error : errors ) { + if ( error > observation ) + continue; + + final double log10likelihood = RecalDatum.log10QempLikelihood(Qemp, observation, error); + Assert.assertTrue(observation == 0 ? MathUtils.compareDoubles(log10likelihood, 0.0) == 0 : log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); + } + } + } + } + + @Test + public void testLongToInt() { + long l = new Long((long)Integer.MAX_VALUE); + int i = RecalDatum.longToInt(l); + Assert.assertEquals(i, Integer.MAX_VALUE); + + l++; + i = RecalDatum.longToInt(l); + Assert.assertEquals(i, Integer.MAX_VALUE); + } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java index 500a41e74..6c3b18a92 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalUtilsUnitTest.java @@ -146,7 +146,7 @@ public final class RecalUtilsUnitTest extends BaseTest { public NestedIntegerArray makeTable(final List rows) { final NestedIntegerArray x = new NestedIntegerArray(3, 3); for ( final Row r : rows ) - x.put(new RecalDatum(r.no, r.ne, (byte)10), r.rg, r.qual); + x.put(new RecalDatum((long)r.no, (double)r.ne, (byte)10), r.rg, r.qual); return x; 
} } diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 9707ed078..43c9245d7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -20,9 +20,11 @@ public class RecalibrationReportUnitTest { private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); + int nErrors = random.nextInt(maxErrors); + while ( nErrors > nObservations ) + nErrors = random.nextInt(maxErrors); final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); - return new RecalDatum(nObservations, nErrors, (byte)qual); + return new RecalDatum((long)nObservations, (double)nErrors, (byte)qual); } @Test(enabled = true) From 41bc416b65794fd78eefd72161e07a76a55c6114 Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Fri, 4 Jan 2013 16:46:14 -0500 Subject: [PATCH 20/30] Remove AAL and update MD5s. 
--- ...GenotyperGeneralPloidyIntegrationTest.java | 14 ++-- .../UnifiedGenotyperIntegrationTest.java | 76 +++++++++---------- .../HaplotypeCallerIntegrationTest.java | 18 ++--- .../gatk/walkers/annotator/QualByDepth.java | 42 ++-------- .../VariantAnnotatorIntegrationTest.java | 26 +++---- 5 files changed, 72 insertions(+), 104 deletions(-) diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java index 5cdc15e5e..54dcbb395 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperGeneralPloidyIntegrationTest.java @@ -55,36 +55,36 @@ public class UnifiedGenotyperGeneralPloidyIntegrationTest extends WalkerTest { @Test(enabled = true) public void testSNP_ACS_Pools() { - PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","651469eeacdb3ab9e2690cfb71f6a634"); + PC_LSV_Test_short(" -maxAltAlleles 1 -ploidy 6 -out_mode EMIT_ALL_CONFIDENT_SITES","LSV_SNP_ACS","SNP","df0e67c975ef74d593f1c704daab1705"); } @Test(enabled = true) public void testBOTH_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","be7dc20bdb5f200d189706bcf1aeb7ee"); + PC_LSV_Test(String.format(" -maxAltAlleles 2 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_BOTH_GGA","BOTH","d1c113a17e36762d27eb27fd12528e52"); } @Test(enabled = true) public void testINDEL_GGA_Pools() { - PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles 
%s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","25e5ea86d87b7d7ddaad834a6ed7481d"); + PC_LSV_Test(String.format(" -maxAltAlleles 1 -ploidy 24 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",LSV_ALLELES),"LSV_INDEL_GGA","INDEL","ab043eed87fadbe5761a55a4912b19ac"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy3_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","cdbf268d282e57189a88fb83f0e1fd72"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 3","LSV_INDEL_DISC_NOREF_p3","INDEL","95d48e0680019d5406ff9adb8f2ff3ca"); } @Test(enabled = true) public void testINDEL_maxAltAlleles2_ploidy1_Pools_noRef() { - PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","2ed40925cd112c1a45470d215b7ec4b3"); + PC_LSV_Test_NoRef(" -maxAltAlleles 2 -ploidy 1","LSV_INDEL_DISC_NOREF_p1","INDEL","8a4ddd64c4e9c42b4a8622582fcfa9c9"); } @Test(enabled = true) public void testMT_SNP_DISCOVERY_sp4() { - PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","33695a998bcc906cabcc758727004387"); + PC_MT_Test(CEUTRIO_BAM, " -maxAltAlleles 1 -ploidy 8", "MT_SNP_DISCOVERY_sp4","3fc6f4d458313616727c60e49c0e852b"); } @Test(enabled = true) public void testMT_SNP_GGA_sp10() { - PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "b2725242114bf9cc9bca14679705ba40"); + PC_MT_Test(CEUTRIO_BAM, String.format(" -maxAltAlleles 1 -ploidy 20 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles %s",NA12891_CALLS), "MT_SNP_GGA_sp10", "1bebbc0f28bff6fd64736ccca8839df8"); } } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java index e40a7ed38..8aaeccc55 100755 --- 
a/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/genotyper/UnifiedGenotyperIntegrationTest.java @@ -30,7 +30,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSamplePilot1() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10,022,000-10,025,000", 1, - Arrays.asList("2ba9af34d2a4d55caf152265a30ead46")); + Arrays.asList("847605f4efafef89529fe0e496315edd")); executeTest("test MultiSample Pilot1", spec); } @@ -38,7 +38,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("0630c35c070d7a7e0cf22b3cce797f22")); + Arrays.asList("5b31b811072a4df04524e13604015f9b")); executeTest("test MultiSample Pilot2 with alleles passed in", spec1); } @@ -46,7 +46,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testWithAllelesPassedIn2() { WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommand + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "allelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,025,000", 1, - Arrays.asList("5857dcb4e6a8422ae0813e42d433b122")); + Arrays.asList("d9992e55381afb43742cc9b30fcd7538")); executeTest("test MultiSample Pilot2 with alleles passed in and emitting all sites", spec2); } @@ -54,7 +54,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testSingleSamplePilot2() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,100,000", 1, - Arrays.asList("489deda5d3276545364a06b7385f8bd9")); + Arrays.asList("dff4412a074940d26994f9552476b209")); executeTest("test SingleSample Pilot2", spec); } @@ -62,7 +62,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultipleSNPAlleles() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH --dbsnp " + b37dbSNP129 + " -I " + privateTestDir + "multiallelic.snps.bam -o %s -L " + privateTestDir + "multiallelic.snps.intervals", 1, - Arrays.asList("595ba44c75d08dab98df222b8e61ab70")); + Arrays.asList("b41b95aaa2c453c9b75b3b29a9c2718e")); executeTest("test Multiple SNP alleles", spec); } @@ -70,7 +70,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testBadRead() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm BOTH -I " + privateTestDir + "badRead.test.bam -o %s -L 1:22753424-22753464", 1, - Arrays.asList("360f9795facdaa14c0cb4b05207142e4")); + Arrays.asList("d915535c1458733f09f82670092fcab6")); executeTest("test bad read", spec); } @@ -78,7 +78,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReverseTrim() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + validationDataLocation + "CEUTrio.HiSeq.b37.chr20.10_11mb.bam -o %s -L 20:10289124 -L 20:10090289", 1, - Arrays.asList("4b4a62429f8eac1e2f27ba5e2edea9e5")); + Arrays.asList("44e9f6cf11b4efecb454cd3de8de9877")); executeTest("test reverse trim", spec); } @@ -86,7 +86,7 @@ public class UnifiedGenotyperIntegrationTest 
extends WalkerTest { public void testMismatchedPLs() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -glm INDEL -I " + privateTestDir + "mismatchedPLs.bam -o %s -L 1:24020341", 1, - Arrays.asList("cc892c91a93dbd8dbdf645803f35a0ee")); + Arrays.asList("935ee705ffe8cc6bf1d9efcceea271c8")); executeTest("test mismatched PLs", spec); } @@ -96,7 +96,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // // -------------------------------------------------------------------------------------------------------------- - private final static String COMPRESSED_OUTPUT_MD5 = "3fc7d2681ff753e2d68605d7cf8b63e3"; + private final static String COMPRESSED_OUTPUT_MD5 = "e6e33f0ebabab027eabed51fe9a08da9"; @Test public void testCompressedOutput() { @@ -149,7 +149,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinBaseQualityScore() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --min_base_quality_score 26", 1, - Arrays.asList("04dc83d7dfb42b8cada91647bd9f32f1")); + Arrays.asList("6ee6537e9ebc1bfc7c6cf8f04b1582ff")); executeTest("test min_base_quality_score 26", spec); } @@ -157,7 +157,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testSLOD() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --computeSLOD --no_cmdline_in_header -glm BOTH --dbsnp " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("4429a665a1048f958db3c204297cdb9f")); + Arrays.asList("55760482335497086458b09e415ecf54")); executeTest("test SLOD", spec); } @@ -165,7 +165,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void 
testNDA() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " --annotateNDA -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("f063e3573c513eaa9ce7d7df22143362")); + Arrays.asList("938e888a40182878be4c3cc4859adb69")); executeTest("test NDA", spec); } @@ -173,7 +173,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testCompTrack() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b36KGReference + " --no_cmdline_in_header -glm BOTH -comp:FOO " + b36dbSNP129 + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000", 1, - Arrays.asList("d76e93e2676354dde832f08a508c6f88")); + Arrays.asList("7dc186d420487e4e156a24ec8dea0951")); executeTest("test using comp track", spec); } @@ -187,17 +187,17 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { @Test public void testOutputParameterSitesOnly() { - testOutputParameters("-sites_only", "1a65172b9bd7a2023d48bc758747b34a"); + testOutputParameters("-sites_only", "f99c7471127a6fb6f72e136bc873b2c9"); } @Test public void testOutputParameterAllConfident() { - testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "3f1fa34d8440f6f21654ce60c0ba8f28"); + testOutputParameters("--output_mode EMIT_ALL_CONFIDENT_SITES", "9dbc9389db39cf9697e93e0bf529314f"); } @Test public void testOutputParameterAllSites() { - testOutputParameters("--output_mode EMIT_ALL_SITES", "f240434b4d3c234f6f9e349e9ec05f4e"); + testOutputParameters("--output_mode EMIT_ALL_SITES", "8b26088a035e579c4afd3b46737291e4"); } private void testOutputParameters(final String args, final String md5) { @@ -211,7 +211,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testConfidence() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation 
+ "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 -stand_call_conf 10 ", 1, - Arrays.asList("aec378bed312b3557c6dd7ec740c8091")); + Arrays.asList("4af83a883ecc03a23b0aa6dd4b8f1ceb")); executeTest("test confidence 1", spec1); } @@ -222,12 +222,12 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { // -------------------------------------------------------------------------------------------------------------- @Test public void testHeterozyosity1() { - testHeterozosity( 0.01, "5da6b24033a6b02f466836443d49560e" ); + testHeterozosity( 0.01, "bdc8760d7ae1e01c0510b12c1e6fcfa3" ); } @Test public void testHeterozyosity2() { - testHeterozosity( 1.0 / 1850, "1f284c4af967a3c26687164f9441fb16" ); + testHeterozosity( 1.0 / 1850, "f508f06a47305e11e62776615cb14fe3" ); } private void testHeterozosity(final double arg, final String md5) { @@ -251,7 +251,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("cff553c53de970f64051ed5711407038")); + Arrays.asList("13d91059f58fb50a07a6a34b9438a45b")); executeTest(String.format("test multiple technologies"), spec); } @@ -270,7 +270,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -L 1:10,000,000-10,100,000" + " -baq CALCULATE_AS_NECESSARY", 1, - Arrays.asList("f960a91963e614a6c8d8cda57836df24")); + Arrays.asList("07d8b77a5f6697f3a47a4f1efb0dcf50")); executeTest(String.format("test calling with BAQ"), spec); } @@ -289,7 +289,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("46a6d24c82ebb99d305462960fa09b7c")); + Arrays.asList("0f026d2e568172cf32813cc54ea7ba23")); executeTest(String.format("test indel caller in SLX"), spec); } @@ -304,7 +304,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -minIndelCnt 1" + " -L 1:10,000,000-10,100,000", 1, - Arrays.asList("2be25321bbc6a963dba7ecba5dd76802")); 
+ Arrays.asList("e7ad858e9d6617534761918561f3ed4c")); executeTest(String.format("test indel caller in SLX with low min allele count"), spec); } @@ -317,7 +317,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 1:10,000,000-10,500,000", 1, - Arrays.asList("d6b2657cd5a4a949968cdab50efce515")); + Arrays.asList("39c7a813fd6ee82d3604f2a868b35b2a")); executeTest(String.format("test indel calling, multiple technologies"), spec); } @@ -327,7 +327,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("9cff66a321284c362f393bc4db21f756")); + Arrays.asList("9430fe36789a791fcff6162f768ae563")); executeTest("test MultiSample Pilot2 indels with alleles passed in", spec); } @@ -337,7 +337,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { baseCommandIndels + " --output_mode EMIT_ALL_SITES --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + privateTestDir + "indelAllelesForUG.vcf -I " + validationDataLocation + "pilot2_daughters.chr20.10k-11k.bam -o %s -L 20:10,000,000-10,100,000", 1, - Arrays.asList("90c8cfcf65152534c16ed81104fc3bcd")); + Arrays.asList("8d8dbf483526b0b309f5728619a74a86")); executeTest("test MultiSample Pilot2 indels with alleles passed in and emitting all sites", spec); } @@ -345,13 +345,13 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMultiSampleIndels1() { WalkerTest.WalkerTestSpec spec1 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("457b8f899cf1665de61e75084dbb79d0")); + Arrays.asList("5667a699a3a13474f2d1cd2d6b01cd5b")); List 
result = executeTest("test MultiSample Pilot1 CEU indels", spec1).getFirst(); WalkerTest.WalkerTestSpec spec2 = new WalkerTest.WalkerTestSpec( baseCommandIndels + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -alleles " + result.get(0).getAbsolutePath() + " -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -o %s -L 1:10450700-10551000", 1, - Arrays.asList("a13fe7aa3b9e8e091b3cf3442a056ec1")); + Arrays.asList("b6c1d5cd28ff584c5f5037afef4e883a")); executeTest("test MultiSample Pilot1 CEU indels using GENOTYPE_GIVEN_ALLELES", spec2); } @@ -361,7 +361,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommandIndelsb37 + " --genotyping_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + privateTestDir + vcf + " -I " + validationDataLocation + "NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.20.bam -o %s -L " + validationDataLocation + vcf, 1, - Arrays.asList("d075ad318739c8c56bdce857da1e48b9")); + Arrays.asList("d76eacc4021b78ccc0a9026162e814a7")); executeTest("test GENOTYPE_GIVEN_ALLELES with no evidence in reads", spec); } @@ -373,7 +373,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { " -o %s" + " -L 20:10,000,000-10,100,000", 1, - Arrays.asList("91c632ab17a1dd89ed19ebb20324f905")); + Arrays.asList("1e0d2c15546c3b0959b00ffb75488b56")); executeTest(String.format("test UG with base indel quality scores"), spec); } @@ -407,7 +407,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction0() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.0", 1, - Arrays.asList("1d80e135d611fe19e1fb1882aa588a73")); + Arrays.asList("db3026c49a3de7a5cb9a3d77635d0706")); executeTest("test minIndelFraction 0.0", spec); } @@ -415,7 +415,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction25() { 
WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 0.25", 1, - Arrays.asList("752139616752902fca13c312d8fe5e22")); + Arrays.asList("7ab8e5ee15ab98d6756b0eea0f4d3798")); executeTest("test minIndelFraction 0.25", spec); } @@ -423,7 +423,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testMinIndelFraction100() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( assessMinIndelFraction + " -minIndelFrac 1", 1, - Arrays.asList("d66b9decf26e1704abda1a919ac149cd")); + Arrays.asList("3f07efb768e08650a7ce333edd4f9a52")); executeTest("test minIndelFraction 1.0", spec); } @@ -437,7 +437,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testNsInCigar() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "testWithNs.bam -o %s -L 8:141813600-141813700 -out_mode EMIT_ALL_SITES", 1, - Arrays.asList("b62ba9777efc05af4c36e2d4ce3ee67c")); + Arrays.asList("4d36969d4f8f1094f1fb6e7e085c19f6")); executeTest("test calling on reads with Ns in CIGAR", spec); } @@ -451,18 +451,18 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T UnifiedGenotyper -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("f72ecd00b2913f63788faa7dabb1d102")); + Arrays.asList("092e42a712afb660ec79ff11c55933e2")); executeTest("test calling on a ReducedRead BAM", spec); } @Test public void testReducedBamSNPs() { - testReducedCalling("SNP", "f059743858004ceee325f2a7761a2362"); + testReducedCalling("SNP", "c0de74ab8f4f14eb3a2c5d55c200ac5f"); } @Test public void testReducedBamINDELs() { - testReducedCalling("INDEL", "04845ba1ec7d8d8b0eab2ca6bdb9c1a6"); + 
testReducedCalling("INDEL", "1c9aaf65ffaa12bb766855265a1c3f8e"); } @@ -483,7 +483,7 @@ public class UnifiedGenotyperIntegrationTest extends WalkerTest { public void testContaminationDownsampling() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( baseCommand + " -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -o %s -L 1:10,000,000-10,010,000 --contamination_fraction_to_filter 0.20", 1, - Arrays.asList("b500ad5959bce69f888a2fac024647e5")); + Arrays.asList("1f9071466fc40f4c6a0f58ac8e9135fb")); executeTest("test contamination_percentage_to_filter 0.20", spec); } diff --git a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java index bb9efe15d..1683044f0 100644 --- a/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java +++ b/protected/java/test/org/broadinstitute/sting/gatk/walkers/haplotypecaller/HaplotypeCallerIntegrationTest.java @@ -21,19 +21,19 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSample() { - HCTest(CEUTRIO_BAM, "", "7122d4f0ef94c5274aa3047cfebe08ed"); + HCTest(CEUTRIO_BAM, "", "47fdbe5f01d3ce5e53056eea8c488e45"); } @Test public void testHaplotypeCallerSingleSample() { - HCTest(NA12878_BAM, "", "6cd6e6787521c07a7bae98766fd628ab"); + HCTest(NA12878_BAM, "", "a2c63f6e6e51a01019bdbd23125bdb15"); } // TODO -- add more tests for GGA mode, especially with input alleles that are complex variants and/or not trimmed @Test public void testHaplotypeCallerMultiSampleGGA() { HCTest(CEUTRIO_BAM, "--max_alternate_alleles 3 -gt_mode GENOTYPE_GIVEN_ALLELES -out_mode EMIT_ALL_SITES -alleles " + validationDataLocation + "combined.phase1.chr20.raw.indels.sites.vcf", - "44df2a9da4fbd2162ae44c3f2a6ef01f"); + "54b7cc3da3d8349ff4302f99883ab188"); } 
private void HCTestComplexVariants(String bam, String args, String md5) { @@ -44,7 +44,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerMultiSampleComplex() { - HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "4a413eeb7a75cab0ab5370b4c08dcf8e"); + HCTestComplexVariants(privateTestDir + "AFR.complex.variants.bam", "", "6c0c441b71848c2eea38ab5e2afe1120"); } private void HCTestSymbolicVariants(String bam, String args, String md5) { @@ -55,7 +55,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleSymbolic() { - HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "77cf5b5273828dd1605bb23a5aeafcaa"); + HCTestSymbolicVariants(NA12878_CHR20_BAM, "", "0761ff5cbf279be467833fa6708bf360"); } private void HCTestIndelQualityScores(String bam, String args, String md5) { @@ -66,20 +66,20 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { @Test public void testHaplotypeCallerSingleSampleIndelQualityScores() { - HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "87ca97f90e74caee35c35616c065821c"); + HCTestIndelQualityScores(NA12878_RECALIBRATED_BAM, "", "29f1125df5ab27cc937a144ae08ac735"); } @Test public void HCTestProblematicReadsModifiedInActiveRegions() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "haplotype-problem-4.bam") + " --no_cmdline_in_header -o %s -minPruning 3 -L 4:49139026-49139965"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("3df42d0550b51eb9b55aac61e8b3c452")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("ece627de486aee69d02872891c6cb0ff")); executeTest("HCTestProblematicReadsModifiedInActiveRegions: ", spec); } @Test public void HCTestStructuralIndels() { final String base = String.format("-T HaplotypeCaller -R %s -I %s", REF, privateTestDir + "AFR.structural.indels.bam") + " --no_cmdline_in_header -o %s 
-minPruning 6 -L 20:8187565-8187800 -L 20:18670537-18670730"; - final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("4dbc72b72e3e2d9d812d5a398490e213")); + final WalkerTestSpec spec = new WalkerTestSpec(base, Arrays.asList("add0f4f51969b7caeea99005a7ba1aa4")); executeTest("HCTestStructuralIndels: ", spec); } @@ -93,7 +93,7 @@ public class HaplotypeCallerIntegrationTest extends WalkerTest { public void HCTestReducedBam() { WalkerTest.WalkerTestSpec spec = new WalkerTest.WalkerTestSpec( "-T HaplotypeCaller -R " + b37KGReference + " --no_cmdline_in_header -I " + privateTestDir + "bamExample.ReducedRead.ADAnnotation.bam -o %s -L 1:67,225,396-67,288,518", 1, - Arrays.asList("f8c2745bf71f2659a57494fcaa2c103b")); + Arrays.asList("8a400b0c46f41447fcc35a907e34f384")); executeTest("HC calling on a ReducedRead BAM", spec); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java index 24bac9deb..253313f8f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/QualByDepth.java @@ -1,5 +1,6 @@ package org.broadinstitute.sting.gatk.walkers.annotator; +import com.sun.org.apache.bcel.internal.generic.AALOAD; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -68,50 +69,17 @@ public class QualByDepth extends InfoFieldAnnotation implements StandardAnnotati if ( depth == 0 ) return null; - double QD = -10.0 * vc.getLog10PError() / (double)depth; + double altAlleleLength = AverageAltAlleleLength.getMeanAltAlleleLength(vc); + double QD = -10.0 * vc.getLog10PError() / ((double)depth * altAlleleLength); Map map = new HashMap(); - - if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { - // adjust for the event length - int averageLengthNum = 0; - int averageLengthDenom = 0; - int refLength = vc.getReference().length(); - for ( Allele a : vc.getAlternateAlleles() ) { - int numAllele = vc.getCalledChrCount(a); - int alleleSize; - if ( a.length() == refLength ) { - // SNP or MNP - byte[] a_bases = a.getBases(); - byte[] ref_bases = vc.getReference().getBases(); - int n_mismatch = 0; - for ( int idx = 0; idx < a_bases.length; idx++ ) { - if ( a_bases[idx] != ref_bases[idx] ) - n_mismatch++; - } - alleleSize = n_mismatch; - } - else if ( a.isSymbolic() ) { - alleleSize = 1; - } else { - alleleSize = Math.abs(refLength-a.length()); - } - averageLengthNum += alleleSize*numAllele; - averageLengthDenom += numAllele; - } - double averageLength = ( (double) averageLengthNum )/averageLengthDenom; - QD /= averageLength; - map.put(getKeyNames().get(1),String.format("%.2f",averageLength)); - } - map.put(getKeyNames().get(0), String.format("%.2f", QD)); return map; } - public List getKeyNames() { return Arrays.asList("QD","AAL"); } + public List getKeyNames() { return Arrays.asList("QD"); } public List getDescriptions() { - return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth"), - new VCFInfoHeaderLine(getKeyNames().get(1), 1, VCFHeaderLineType.Float, "Average Allele Length")); + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Variant Confidence/Quality by Depth")); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java index 90e1d5c34..b097e3d34 100755 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java +++ 
b/public/java/test/org/broadinstitute/sting/gatk/walkers/annotator/VariantAnnotatorIntegrationTest.java @@ -32,7 +32,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("a127623a26bac4c17c9df491e170ed88")); + Arrays.asList("fbfbd4d13b7ba3d76e8e186902e81378")); executeTest("test file has annotations, asking for annotations, #1", spec); } @@ -40,7 +40,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testHasAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("13e24e6b9dfa241df5baa2c3f53415b9")); + Arrays.asList("19aef8914efc497192f89a9038310ca5")); executeTest("test file has annotations, asking for annotations, #2", spec); } @@ -66,7 +66,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking1() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("07cb4d427235878aeec0066d7d298e54")); + Arrays.asList("4f0b8033da18e6cf6e9b8d5d36c21ba2")); executeTest("test file doesn't have annotations, asking for annotations, #1", spec); } @@ -74,7 +74,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoAnnotsAsking2() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -I " + validationDataLocation + 
"NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,000,000-10,050,000", 1, - Arrays.asList("e579097677d5e56a5776151251947961")); + Arrays.asList("64ca176d587dfa2b3b9dec9f7999305c")); executeTest("test file doesn't have annotations, asking for annotations, #2", spec); } @@ -82,7 +82,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testExcludeAnnotations() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard -XA FisherStrand -XA ReadPosRankSumTest --variant " + privateTestDir + "vcfexample2empty.vcf -I " + validationDataLocation + "low_coverage_CEU.chr1.10k-11k.bam -L 1:10,020,000-10,021,000", 1, - Arrays.asList("348314945436ace71ce6b1a52559d9ee")); + Arrays.asList("f33f417fad98c05d9cd08ffa22943b0f")); executeTest("test exclude annotations", spec); } @@ -90,7 +90,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testOverwritingHeader() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample4.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L 1:10,001,292", 1, - Arrays.asList("ae7930e37a66c0aa4cfe0232736864fe")); + Arrays.asList("0c810f6c4abef9d9dc5513ca872d3d22")); executeTest("test overwriting header", spec); } @@ -98,7 +98,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoReads() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("a0ba056c2625033e5e859fd6bcec1256")); + Arrays.asList("1c423b7730b9805e7b885ece924286e0")); executeTest("not passing it any reads", spec); } @@ -106,7 +106,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --dbsnp " + b36dbSNP129 + " -G Standard --variant " + 
privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("0be7da17340111a94e8581ee3808c88a")); + Arrays.asList("54d7d5bb9404652857adf5e50d995f30")); executeTest("getting DB tag with dbSNP", spec); } @@ -114,7 +114,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testMultipleIdsWithDbsnp() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --alwaysAppendDbsnpId --dbsnp " + b36dbSNP129 + " -G Standard --variant " + privateTestDir + "vcfexample3withIDs.vcf -L " + privateTestDir + "vcfexample3withIDs.vcf", 1, - Arrays.asList("e40e625302a496ede42eed61c2ce524b")); + Arrays.asList("5fe63e511061ed4f91d938e72e7e3c39")); executeTest("adding multiple IDs with dbSNP", spec); } @@ -122,7 +122,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testDBTagWithHapMap() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --comp:H3 " + privateTestDir + "fakeHM3.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("cb50876477d3e035b6eda5d720d7ba8d")); + Arrays.asList("cc7184263975595a6e2473d153227146")); executeTest("getting DB tag with HM3", spec); } @@ -130,7 +130,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testNoQuals() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --variant " + privateTestDir + "noQual.vcf -I " + validationDataLocation + "NA12878.1kg.p2.chr1_10mb_11_mb.SLX.bam -L " + privateTestDir + "noQual.vcf -A QualByDepth", 1, - Arrays.asList("458412261d61797d39f802c1e03d63f6")); + Arrays.asList("aea983adc01cd059193538cc30adc17d")); executeTest("test file doesn't have QUALs", spec); } @@ -138,7 +138,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpression() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir 
+ "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.AF -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("39defa8108dca9fa3e54b22a7da43f77")); + Arrays.asList("2b0e8cdfd691779befc5ac123d1a1887")); executeTest("using expression", spec); } @@ -146,7 +146,7 @@ public class VariantAnnotatorIntegrationTest extends WalkerTest { public void testUsingExpressionWithID() { WalkerTestSpec spec = new WalkerTestSpec( baseTestString() + " --resource:foo " + privateTestDir + "targetAnnotations.vcf -G Standard --variant " + privateTestDir + "vcfexample3empty.vcf -E foo.ID -L " + privateTestDir + "vcfexample3empty.vcf", 1, - Arrays.asList("a917edd58a0c235e9395bfc2d2020a8c")); + Arrays.asList("3de1d1998203518098ffae233f3e2352")); executeTest("using expression with ID", spec); } From 7b7efa0fffa41159f4c1817db5491ae5409aa73c Mon Sep 17 00:00:00 2001 From: Chris Hartl Date: Fri, 4 Jan 2013 16:47:26 -0500 Subject: [PATCH 21/30] Add in the AAL as an experimental covariate, in case it's wanted. 
--- .../annotator/AverageAltAlleleLength.java | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java new file mode 100644 index 000000000..d6768cb37 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/AverageAltAlleleLength.java @@ -0,0 +1,92 @@ +package org.broadinstitute.sting.gatk.walkers.annotator; + +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ActiveRegionBasedAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatible; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation; +import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation; +import org.broadinstitute.sting.utils.genotyper.PerReadAlleleLikelihoodMap; +import org.broadinstitute.variant.variantcontext.Allele; +import org.broadinstitute.variant.variantcontext.Genotype; +import org.broadinstitute.variant.variantcontext.GenotypesContext; +import org.broadinstitute.variant.variantcontext.VariantContext; +import org.broadinstitute.variant.vcf.VCFHeaderLineType; +import org.broadinstitute.variant.vcf.VCFInfoHeaderLine; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by IntelliJ IDEA. + * User: chartl + * Date: 1/3/13 + * Time: 11:36 AM + * To change this template use File | Settings | File Templates. 
+ */ +public class AverageAltAlleleLength extends InfoFieldAnnotation implements ActiveRegionBasedAnnotation, ExperimentalAnnotation { + + public List getDescriptions() { + return Arrays.asList(new VCFInfoHeaderLine(getKeyNames().get(0), 1, VCFHeaderLineType.Float, "Average Allele Length")); + } + + public List getKeyNames() { return Arrays.asList("AAL"); } + + public Map annotate(final RefMetaDataTracker tracker, + final AnnotatorCompatible walker, + final ReferenceContext ref, + final Map stratifiedContexts, + final VariantContext vc, + final Map perReadAlleleLikelihoodMap ) { + if ( !vc.hasLog10PError() ) + return null; + + final GenotypesContext genotypes = vc.getGenotypes(); + if ( genotypes == null || genotypes.size() == 0 ) + return null; + + Map map = new HashMap(); + + double length = getMeanAltAlleleLength(vc); + map.put(getKeyNames().get(0),String.format("%.2f",length)); + return map; + } + + public static double getMeanAltAlleleLength(VariantContext vc) { + double averageLength = 1.0; + if ( ! vc.isSNP() && ! 
vc.isSymbolic() ) { + // adjust for the event length + int averageLengthNum = 0; + int averageLengthDenom = 0; + int refLength = vc.getReference().length(); + for ( Allele a : vc.getAlternateAlleles() ) { + int numAllele = vc.getCalledChrCount(a); + int alleleSize; + if ( a.length() == refLength ) { + // SNP or MNP + byte[] a_bases = a.getBases(); + byte[] ref_bases = vc.getReference().getBases(); + int n_mismatch = 0; + for ( int idx = 0; idx < a_bases.length; idx++ ) { + if ( a_bases[idx] != ref_bases[idx] ) + n_mismatch++; + } + alleleSize = n_mismatch; + } + else if ( a.isSymbolic() ) { + alleleSize = 1; + } else { + alleleSize = Math.abs(refLength-a.length()); + } + averageLengthNum += alleleSize*numAllele; + averageLengthDenom += numAllele; + } + averageLength = ( (double) averageLengthNum )/averageLengthDenom; + } + + return averageLength; + } +} From 01738e70c37a6280be25793bc94cb3b948d59329 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 4 Jan 2013 16:53:50 -0500 Subject: [PATCH 22/30] Archive the experimental Active Region Traversals --- .../sting/gatk/GenomeAnalysisEngine.java | 31 +- .../arguments/GATKArgumentCollection.java | 10 - .../ActiveRegionShardDataProvider.java | 58 ---- .../providers/LocusShardDataProvider.java | 16 - .../datasources/reads/ActiveRegionShard.java | 41 --- .../reads/ActiveRegionShardBalancer.java | 32 -- .../sting/gatk/datasources/reads/Shard.java | 4 +- .../gatk/executive/LinearMicroScheduler.java | 23 -- .../sting/gatk/executive/MicroScheduler.java | 8 +- ...ctiveRegionShardTraverseActiveRegions.java | 309 ------------------ ...imentalReadShardTraverseActiveRegions.java | 309 ------------------ .../utils/activeregion/ActiveRegion.java | 18 - .../ExperimentalActiveRegionShardType.java | 14 - .../TraverseActiveRegionsUnitTest.java | 92 +----- 14 files changed, 21 insertions(+), 944 deletions(-) delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java delete 
mode 100755 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java delete mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index bee25dc2f..1187039bb 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -570,32 +570,11 @@ public class GenomeAnalysisEngine { else if(walker instanceof ActiveRegionWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. 
Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - - switch(argCollection.activeRegionShardType) { - case LOCUSSHARD: - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); - case READSHARD: - // Use the legacy ReadShardBalancer if legacy downsampling is enabled - ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? - new LegacyReadShardBalancer() : - new ReadShardBalancer(); - - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(), readShardBalancer); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), readShardBalancer); - case ACTIVEREGIONSHARD: - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new ActiveRegionShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); - default: - throw new UserException.CommandLineException("Invalid active region shard type."); - } - } + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return 
readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index beaeacc85..d9c7c9008 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -37,7 +37,6 @@ import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; -import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; @@ -449,14 +448,5 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed - - // -------------------------------------------------------------------------------------------------------------- - // - // Experimental Active Region Traversal modes - // - // -------------------------------------------------------------------------------------------------------------- - - @Argument(fullName = "active_region_traversal_shard_type", shortName = "active_region_traversal_shard_type", doc = "Choose an experimental shard type for 
active region traversal, instead of the default LocusShard", required = false) - public ExperimentalActiveRegionShardType activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java deleted file mode 100644 index 55e51f934..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.datasources.providers; - -import net.sf.picard.reference.IndexedFastaSequenceFile; -import org.broadinstitute.sting.gatk.ReadProperties; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; -import org.broadinstitute.sting.gatk.iterators.LocusIterator; -import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.Collection; - -/** - * @author Joel Thibault - */ -public class ActiveRegionShardDataProvider extends ShardDataProvider { - final private ReadShardDataProvider readProvider; - final private LocusShardDataProvider locusProvider; - - public ActiveRegionShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, StingSAMIterator reads, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { - super(shard, genomeLocParser, reference, rods); // TODO: necessary? 
- readProvider = new ReadShardDataProvider(shard, genomeLocParser, reads, reference, rods); - locusProvider = new LocusShardDataProvider(shard, sourceInfo, genomeLocParser, locus, locusIterator, reference, rods); - } - - public ReadShardDataProvider getReadShardDataProvider() { - return readProvider; - } - - public LocusShardDataProvider getLocusShardDataProvider(LocusIterator iterator) { - return locusProvider; - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 1607469eb..4888b9f41 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -44,22 +44,6 @@ public class LocusShardDataProvider extends ShardDataProvider { this.locusIterator = locusIterator; } - /** - * Create a data provider based on an input provider - * Used only by ExperimentalReadShardTraverseActiveRegions - * @param dataProvider - * @param sourceInfo - * @param genomeLocParser - * @param locus - * @param locusIterator - */ - public LocusShardDataProvider(ShardDataProvider dataProvider, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator) { - super(dataProvider.getShard(),genomeLocParser,dataProvider.getReference(),dataProvider.getReferenceOrderedData()); - this.sourceInfo = sourceInfo; - this.locus = locus; - this.locusIterator = locusIterator; - } - /** * Returns information about the source of the reads. * @return Info about the source of the reads. 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java deleted file mode 100755 index 381b193e9..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -import net.sf.samtools.SAMFileSpan; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.GenomeLocParser; - -import java.util.List; -import java.util.Map; - -/** - * @author Joel Thibault - */ -public class ActiveRegionShard extends ReadShard { - public ActiveRegionShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { - super(parser, readsDataSource, fileSpans, loci, isUnmapped); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java deleted file mode 100644 index 338dd1bdf..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright (c) 2012, The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -package org.broadinstitute.sting.gatk.datasources.reads; - -/** - * @author Joel Thibault - */ -public class ActiveRegionShardBalancer extends ReadShardBalancer { - // TODO ? -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index 314156af6..e22a7a54d 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -40,9 +40,7 @@ import java.util.Map; */ public abstract class Shard implements HasGenomeLocation { public enum ShardType { - READ, - LOCUS, - ACTIVEREGION // Used only by ExperimentalActiveRegionShardTraverseActiveRegions + READ, LOCUS } protected final GenomeLocParser parser; // incredibly annoying! 
diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 44f9978a6..f3c1ae91c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -2,7 +2,6 @@ package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; -import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -12,8 +11,6 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; -import org.broadinstitute.sting.gatk.traversals.ExperimentalActiveRegionShardTraverseActiveRegions; -import org.broadinstitute.sting.gatk.traversals.ExperimentalReadShardTraverseActiveRegions; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -81,18 +78,6 @@ public class LinearMicroScheduler extends MicroScheduler { } windowMaker.close(); } - else if(shard.getShardType() == Shard.ShardType.ACTIVEREGION) { - WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), - getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); - for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - ShardDataProvider 
dataProvider = new ActiveRegionShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),getReadIterator(shard),iterator.getLocus(),iterator,reference,rods); - Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); - accumulator.accumulate(dataProvider,result); - dataProvider.close(); - if ( walker.isDone() ) break; - } - windowMaker.close(); - } else { ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); @@ -108,14 +93,6 @@ public class LinearMicroScheduler extends MicroScheduler { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator } - else if( traversalEngine instanceof ExperimentalReadShardTraverseActiveRegions ) { - final Object result = ((ExperimentalReadShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); - accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator - } - else if( traversalEngine instanceof ExperimentalActiveRegionShardTraverseActiveRegions) { - final Object result = ((ExperimentalActiveRegionShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); - accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator - } Object result = accumulator.finishTraversal(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index 13c11def6..f8aec1489 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -41,7 +41,6 @@ import 
org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.MathUtils; -import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; @@ -246,12 +245,7 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - switch (engine.getArguments().activeRegionShardType) { - case LOCUSSHARD: return new TraverseActiveRegions(); - case READSHARD: return new ExperimentalReadShardTraverseActiveRegions(); - case ACTIVEREGIONSHARD: return new ExperimentalActiveRegionShardTraverseActiveRegions(); - default: throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type of ActiveRegionWalker."); - } + return new TraverseActiveRegions(); } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java deleted file mode 100644 index 45d132678..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java +++ /dev/null @@ -1,309 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import 
org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,ActiveRegionShardDataProvider> { - /** - * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - - private final LinkedList workQueue = new LinkedList(); - private final LinkedList myReads = new LinkedList(); - - @Override - public String getTraversalUnits() { - return "active regions"; - } - - @Override - public T traverse( final ActiveRegionWalker walker, - final ActiveRegionShardDataProvider dataProvider, - T sum) { - logger.debug(String.format("ExperimentalActiveRegionShardTraverseActiveRegions.traverse: Shard is %s", dataProvider)); - - ReadShardDataProvider readDataProvider = dataProvider.getReadShardDataProvider(); - - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - final ReadView 
readView = new ReadView(readDataProvider); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - - Shard readShard = readDataProvider.getShard(); - SAMFileHeader header = readShard.getReadProperties().getHeader(); - WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), - readView.iterator(), readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - - for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - LocusShardDataProvider locusDataProvider = dataProvider.getLocusShardDataProvider(iterator); - final LocusView locusView = new AllLocusView(locusDataProvider); - final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - readDataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). 
- final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - locusDataProvider.close(); - } - - windowMaker.close(); - - updateCumulativeMetrics(readDataProvider.getShard()); - - if ( ! profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." 
); - - // now process the active regions, where possible - boolean emptyQueue = false; - sum = processActiveRegions(walker, sum, emptyQueue); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); - } - - - // -------------------------------------------------------------------------------- - // - // simple utility functions - // - // -------------------------------------------------------------------------------- - - private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, - final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext locus, final GenomeLoc location) { - if ( walker.hasPresetActiveRegions() ) { - return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 
1.0 : 0.0); - } else { - return walker.isActive( tracker, refContext, locus ); - } - } - - private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - final LocusView locusView) { - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - return new ManagingReferenceOrderedView( dataProvider ); - else - return (RodLocusView)locusView; - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); - } - } - - /** - * Write out each active region to the walker activeRegionOutStream - * - * @param walker - */ - private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { - // Just want to output the active regions to a file, not actually process them - for( final ActiveRegion activeRegion : workQueue ) { - if( activeRegion.isActive ) { - walker.activeRegionOutStream.println( activeRegion.getLocation() ); - } - } - } - - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { - final int lastRegionStart = workQueue.getLast().getLocation().getStart(); - final String lastRegionContig = workQueue.getLast().getLocation().getContig(); - - // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peekFirst() != null ) { - ActiveRegion firstRegion = workQueue.getFirst(); - final String firstRegionContig = firstRegion.getLocation().getContig(); - if 
(emptyQueue || firstRegionContig != lastRegionContig) { - sum = processFirstActiveRegion(sum, walker); - } - else { - final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); - if (lastRegionStart > firstRegionMaxReadStop) { - sum = processFirstActiveRegion( sum, walker ); - } - else { - break; - } - } - } - - return sum; - } - - /** - * Process the first active region and all remaining reads which overlap - * - * Remove the first active region from the queue - * (NB: some reads associated with this active region may have already been processed) - * - * Remove all of these reads from the queue - * (NB: some may be associated with other active regions) - * - * @param sum - * @param walker - * @return - */ - private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { - final ActiveRegion firstRegion = workQueue.removeFirst(); - - GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here - GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); - - while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || - (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { - if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { - // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); - ActiveRegion bestRegion = firstRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( firstRead ); - - // The read is also added to all other regions in which it overlaps but marked as 
non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(firstRegion) ) { - firstRegion.add(firstRead); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { - otherRegionToTest.add( firstRead ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { - otherRegionToTest.add( firstRead ); - } - } - } - } - - // check for non-primary vs. extended - } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - firstRegion.add( firstRead ); - } - } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { - firstRegion.add( firstRead ); - } - - myReads.removeFirst(); - firstRead = myReads.peekFirst(); - firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); - } - - logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? "active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); - final M x = walker.map( firstRegion, null ); - return walker.reduce(x, sum); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. 
- * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - boolean emptyQueue = true; - return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java deleted file mode 100644 index 299ee4f56..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java +++ /dev/null @@ -1,309 +0,0 @@ -package org.broadinstitute.sting.gatk.traversals; - -import net.sf.samtools.SAMFileHeader; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.gatk.WalkerManager; -import org.broadinstitute.sting.gatk.contexts.AlignmentContext; -import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.*; -import org.broadinstitute.sting.gatk.datasources.reads.Shard; -import org.broadinstitute.sting.gatk.executive.WindowMaker; -import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; -import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; -import org.broadinstitute.sting.gatk.walkers.DataSource; -import org.broadinstitute.sting.gatk.walkers.Walker; -import org.broadinstitute.sting.utils.GenomeLoc; -import org.broadinstitute.sting.utils.SampleUtils; -import org.broadinstitute.sting.utils.activeregion.ActiveRegion; -import org.broadinstitute.sting.utils.activeregion.ActivityProfile; -import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.util.*; - -public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,ReadShardDataProvider> { - /** 
- * our log, which we want to capture anything from this class - */ - protected final static Logger logger = Logger.getLogger(TraversalEngine.class); - - private final LinkedList workQueue = new LinkedList(); - private final LinkedList myReads = new LinkedList(); - - @Override - public String getTraversalUnits() { - return "active regions"; - } - - @Override - public T traverse( final ActiveRegionWalker walker, - final ReadShardDataProvider readDataProvider, - T sum) { - logger.debug(String.format("ExperimentalReadShardTraverseActiveRegions.traverse: Read Shard is %s", readDataProvider)); - - final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); - final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - - final ReadView readView = new ReadView(readDataProvider); - - final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - - Shard readShard = readDataProvider.getShard(); - SAMFileHeader header = readShard.getReadProperties().getHeader(); - WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), - readView.iterator(), readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - - for(WindowMaker.WindowMakerIterator iterator: windowMaker) { - LocusShardDataProvider locusDataProvider = new LocusShardDataProvider(readDataProvider, - iterator.getSourceInfo(), engine.getGenomeLocParser(), iterator.getLocus(), iterator); - - final LocusView locusView = new AllLocusView(locusDataProvider); - final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); - - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final 
AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - readDataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); - } - - locusDataProvider.close(); - } - - windowMaker.close(); - - updateCumulativeMetrics(readDataProvider.getShard()); - - if ( ! 
profile.isEmpty() ) - incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - - // add active regions to queue of regions to process - // first check if can merge active regions over shard boundaries - if( !activeRegions.isEmpty() ) { - if( !workQueue.isEmpty() ) { - final ActiveRegion last = workQueue.getLast(); - final ActiveRegion first = activeRegions.get(0); - if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { - workQueue.removeLast(); - activeRegions.remove(first); - workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); - } - } - workQueue.addAll( activeRegions ); - } - - logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - - // now process the active regions, where possible - boolean emptyQueue = false; - sum = processActiveRegions(walker, sum, emptyQueue); - - return sum; - } - - /** - * Take the individual isActive calls and integrate them into contiguous active regions and - * add these blocks of work to the work queue - * band-pass filter the list of isActive probabilities and turn into active regions - * - * @param profile - * @param activeRegions - * @param activeRegionExtension - * @param maxRegionSize - * @return - */ - private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, - final List activeRegions, - final int activeRegionExtension, - final int maxRegionSize) { - if ( profile.isEmpty() ) - throw new IllegalStateException("trying to incorporate an empty active profile " + profile); - - final ActivityProfile bandPassFiltered = profile.bandPassFilter(); - activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); - return new ActivityProfile( engine.getGenomeLocParser(), 
profile.hasPresetRegions() ); - } - - - // -------------------------------------------------------------------------------- - // - // simple utility functions - // - // -------------------------------------------------------------------------------- - - private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, - final RefMetaDataTracker tracker, final ReferenceContext refContext, - final AlignmentContext locus, final GenomeLoc location) { - if ( walker.hasPresetActiveRegions() ) { - return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); - } else { - return walker.isActive( tracker, refContext, locus ); - } - } - - private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, - final LocusView locusView) { - if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) - return new ManagingReferenceOrderedView( dataProvider ); - else - return (RodLocusView)locusView; - } - - // -------------------------------------------------------------------------------- - // - // code to handle processing active regions - // - // -------------------------------------------------------------------------------- - - private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { - if( walker.activeRegionOutStream != null ) { - writeActiveRegionsToStream(walker); - return sum; - } else { - return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); - } - } - - /** - * Write out each active region to the walker activeRegionOutStream - * - * @param walker - */ - private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { - // Just want to output the active regions to a file, not actually process them - for( final ActiveRegion activeRegion : workQueue ) { - if( activeRegion.isActive ) { - walker.activeRegionOutStream.println( activeRegion.getLocation() ); - } - } - } - 
- private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { - final int lastRegionStart = workQueue.getLast().getLocation().getStart(); - final String lastRegionContig = workQueue.getLast().getLocation().getContig(); - - // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them - // TODO can implement parallel traversal here - while( workQueue.peekFirst() != null ) { - ActiveRegion firstRegion = workQueue.getFirst(); - final String firstRegionContig = firstRegion.getLocation().getContig(); - if (emptyQueue || firstRegionContig != lastRegionContig) { - sum = processFirstActiveRegion(sum, walker); - } - else { - final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); - if (lastRegionStart > firstRegionMaxReadStop) { - sum = processFirstActiveRegion( sum, walker ); - } - else { - break; - } - } - } - - return sum; - } - - /** - * Process the first active region and all remaining reads which overlap - * - * Remove the first active region from the queue - * (NB: some reads associated with this active region may have already been processed) - * - * Remove all of these reads from the queue - * (NB: some may be associated with other active regions) - * - * @param sum - * @param walker - * @return - */ - private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { - final ActiveRegion firstRegion = workQueue.removeFirst(); - - GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here - GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); - - while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || - (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { - if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { - // The region which the highest amount of 
overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); - ActiveRegion bestRegion = firstRegion; - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); - bestRegion = otherRegionToTest; - } - } - bestRegion.add( firstRead ); - - // The read is also added to all other regions in which it overlaps but marked as non-primary - if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(firstRegion) ) { - firstRegion.add(firstRead); - } - for( final ActiveRegion otherRegionToTest : workQueue ) { - if( !bestRegion.equals(otherRegionToTest) ) { - // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { - otherRegionToTest.add( firstRead ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { - otherRegionToTest.add( firstRead ); - } - } - } - } - - // check for non-primary vs. extended - } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - firstRegion.add( firstRead ); - } - } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { - firstRegion.add( firstRead ); - } - - myReads.removeFirst(); - firstRead = myReads.peekFirst(); - firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); - } - - logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? 
"active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); - final M x = walker.map( firstRegion, null ); - return walker.reduce(x, sum); - } - - /** - * Special function called in LinearMicroScheduler to empty out the work queue. - * Ugly for now but will be cleaned up when we push this functionality more into the engine - */ - public T endTraversal( final Walker walker, T sum) { - boolean emptyQueue = true; - return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); - } -} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index d1199ad3d..c12dfcee9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -26,11 +26,6 @@ public class ActiveRegion implements HasGenomeLocation { private final GenomeLocParser genomeLocParser; public final boolean isActive; - // maximum stop position of all reads with start position in this active region - // Used only by ExperimentalReadShardTraverseActiveRegions - // NB: these reads may not be associated with this active region! 
- private int maxReadStop; - public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; @@ -38,7 +33,6 @@ public class ActiveRegion implements HasGenomeLocation { this.extension = extension; extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; - maxReadStop = activeRegionLoc.getStart(); } @Override @@ -99,18 +93,6 @@ public class ActiveRegion implements HasGenomeLocation { public void remove( final GATKSAMRecord read ) { reads.remove( read ); } public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } - public void setMaxReadStop(int maxReadStop) { - this.maxReadStop = maxReadStop; - } - - public int getMaxReadStop() { - return maxReadStop; - } - - public int getExtendedMaxReadStop() { - return maxReadStop + extension; - } - public boolean equalExceptReads(final ActiveRegion other) { if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; if ( isActive != other.isActive ) return false; diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java deleted file mode 100644 index 1e9a0ee94..000000000 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java +++ /dev/null @@ -1,14 +0,0 @@ -package org.broadinstitute.sting.utils.activeregion; - -/** - * Created with IntelliJ IDEA. - * User: thibault - * Date: 1/2/13 - * Time: 4:59 PM - * To change this template use File | Settings | File Templates. 
- */ -public enum ExperimentalActiveRegionShardType { - LOCUSSHARD, // default/legacy type - READSHARD, - ACTIVEREGIONSHARD -} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 0ec4f57f6..645f1ffc4 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -3,16 +3,10 @@ package org.broadinstitute.sting.gatk.traversals; import com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; -import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; -import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; -import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; -import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -21,6 +15,7 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import 
org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -33,7 +28,6 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; -import org.testng.TestException; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -101,9 +95,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - private final TraverseActiveRegions traverse = new TraverseActiveRegions(); - private final ExperimentalReadShardTraverseActiveRegions readShardTraverse = new ExperimentalReadShardTraverseActiveRegions(); - private final ExperimentalActiveRegionShardTraverseActiveRegions activeRegionShardTraverse = new ExperimentalActiveRegionShardTraverseActiveRegions(); + private final TraverseActiveRegions t = new TraverseActiveRegions(); private IndexedFastaSequenceFile reference; private SAMSequenceDictionary dictionary; @@ -114,8 +106,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; - private static final ExperimentalActiveRegionShardType shardType = ExperimentalActiveRegionShardType.LOCUSSHARD; - @BeforeClass private void init() throws FileNotFoundException { reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); @@ -183,8 +173,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); - for (ShardDataProvider dataProvider : createDataProviders(intervals, 
testBAM)) { - traverse(walker, dataProvider, 0); + for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) { + t.traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } @@ -421,10 +411,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (ShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) - traverse(walker, dataProvider, 0); + for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) + t.traverse(walker, dataProvider, 0); - endTraversal(walker, 0); + t.endTraversal(walker, 0); return walker.mappedActiveRegions; } @@ -485,12 +475,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(List intervals, String bamFile) { + private List createDataProviders(List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); - GATKArgumentCollection arguments = new GATKArgumentCollection(); - arguments.activeRegionShardType = shardType; // make explicit - engine.setArguments(arguments); + t.initialize(engine); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -498,65 +486,13 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - List providers = new ArrayList(); - - switch (shardType) { - case LOCUSSHARD: - traverse.initialize(engine); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, 
shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } - } - break; - case READSHARD: - readShardTraverse.initialize(engine); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ReadShardBalancer())) { - providers.add(new ReadShardDataProvider(shard, genomeLocParser, shard.iterator(), reference, new ArrayList())); - } - break; - case ACTIVEREGIONSHARD: - activeRegionShardTraverse.initialize(engine); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new ActiveRegionShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, shard.iterator(), window.getLocus(), window, reference, new ArrayList())); - } - } - break; - default: throw new TestException("Invalid shard type"); + List providers = new ArrayList(); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } } return providers; } - - private void traverse(DummyActiveRegionWalker walker, ShardDataProvider dataProvider, int i) { - switch (shardType) { - case LOCUSSHARD: - traverse.traverse(walker, (LocusShardDataProvider) dataProvider, i); - break; - case READSHARD: - readShardTraverse.traverse(walker, (ReadShardDataProvider) dataProvider, i); - break; - case ACTIVEREGIONSHARD: - activeRegionShardTraverse.traverse(walker, (ActiveRegionShardDataProvider) 
dataProvider, i); - break; - default: throw new TestException("Invalid shard type"); - } - } - - private void endTraversal(DummyActiveRegionWalker walker, int i) { - switch (shardType) { - case LOCUSSHARD: - traverse.endTraversal(walker, i); - break; - case READSHARD: - readShardTraverse.endTraversal(walker, i); - break; - case ACTIVEREGIONSHARD: - activeRegionShardTraverse.endTraversal(walker, i); - break; - default: throw new TestException("Invalid shard type"); - } - } - } From 69bf70c42e8196464f9aec2b2c3e64ed9ef8a97d Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 17:13:31 -0500 Subject: [PATCH 23/30] Cleanup and more unit tests for RecalibrationTables in BQSR -- Added unit tests for combining RecalibrationTables. As a side effect now has serious tests for incrementDatumOrPutIfNecessary -- Removed unnecessary enum.index system from RecalibrationTables. -- Moved what were really static utility methods out of RecalibrationEngine and into RecalUtils. --- .../walkers/bqsr/RecalibrationEngine.java | 51 +------- .../sting/utils/recalibration/RecalUtils.java | 66 +++++++++-- .../recalibration/RecalibrationReport.java | 4 +- .../recalibration/RecalibrationTables.java | 28 ++--- .../RecalibrationReportUnitTest.java | 4 +- .../RecalibrationTablesUnitTest.java | 112 ++++++++++++++++-- 6 files changed, 175 insertions(+), 90 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 910519031..3fbe66547 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -27,10 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Requires; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import 
org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -128,29 +125,19 @@ public class RecalibrationEngine { final byte qual = recalInfo.getQual(eventType, offset); final double isError = recalInfo.getErrorFraction(eventType, offset); - incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); + RecalUtils.incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } } } - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } /** * Finalize, if appropriate, all derived data in recalibrationTables. @@ -226,36 +213,4 @@ public class RecalibrationEngine { if ( ! 
finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); return finalRecalibrationTables; } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. 
- // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); - } - } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 58327b924..afdbeef9c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -269,9 +269,9 @@ public class RecalUtils { final ArrayList> columnNames = new ArrayList>(); // initialize the array to hold the column names columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[0]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.index) { + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { columnNames.add(new Pair(covariateNameMap.get(requestedCovariates[1]), "%s")); // save the required covariate name so we can reference it in the future - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { columnNames.add(covariateValue); columnNames.add(covariateName); } @@ -279,13 +279,13 @@ public class RecalUtils { columnNames.add(eventType); // the order of these column names is important here columnNames.add(empiricalQuality); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) + if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) columnNames.add(estimatedQReported); // only the read group table needs the estimated Q reported columnNames.add(nObservations); columnNames.add(nErrors); final GATKReportTable reportTable; - if 
(tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { + if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { if(sortByCols) { reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { @@ -295,7 +295,7 @@ public class RecalUtils { reportTable.addColumn(columnName.getFirst(), columnName.getSecond()); rowIndex = 0; // reset the row index since we're starting with a new table } else { - reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index); + reportTable = result.get(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()); } final NestedIntegerArray table = recalibrationTables.getTable(tableIndex); @@ -306,9 +306,9 @@ public class RecalUtils { int columnIndex = 0; int keyIndex = 0; reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[0].formatKey(keys[keyIndex++])); - if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.index) { + if (tableIndex != RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), requestedCovariates[1].formatKey(keys[keyIndex++])); - if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { + if (tableIndex >= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal()) { final Covariate covariate = requestedCovariates[tableIndex]; reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), covariate.formatKey(keys[keyIndex++])); @@ -320,7 +320,7 @@ public class RecalUtils { reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), event.toString()); reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEmpiricalQuality()); - if (tableIndex == RecalibrationTables.TableType.READ_GROUP_TABLE.index) + if (tableIndex == 
RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()) reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getEstimatedQReported()); // we only add the estimated Q reported in the RG table reportTable.set(rowIndex, columnNames.get(columnIndex++).getFirst(), datum.getNumObservations()); reportTable.set(rowIndex, columnNames.get(columnIndex).getFirst(), datum.getNumMismatches()); @@ -414,7 +414,7 @@ public class RecalUtils { } // add the optional covariates to the delta table - for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < requestedCovariates.length; i++) { + for (int i = RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < requestedCovariates.length; i++) { final NestedIntegerArray covTable = recalibrationTables.getTable(i); for (final NestedIntegerArray.Leaf leaf : covTable.getAllLeaves()) { final int[] covs = new int[4]; @@ -458,9 +458,9 @@ public class RecalUtils { private static List generateValuesFromKeys(final List keys, final Covariate[] covariates, final Map covariateNameMap) { final List values = new ArrayList(4); - values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.index].formatKey((Integer)keys.get(0))); + values.add(covariates[RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal()].formatKey((Integer)keys.get(0))); final int covariateIndex = (Integer)keys.get(1); - final Covariate covariate = covariateIndex == covariates.length ? covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.index] : covariates[covariateIndex]; + final Covariate covariate = covariateIndex == covariates.length ? 
covariates[RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal()] : covariates[covariateIndex]; final int covariateKey = (Integer)keys.get(2); values.add(covariate.formatKey(covariateKey)); values.add(covariateNameMap.get(covariate)); @@ -793,4 +793,48 @@ public class RecalUtils { myDatum.combine(row.value); } } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + public static void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. 
+ // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } + + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + private static RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index ff0890ff0..12760b195 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -139,12 +139,12 @@ public class RecalibrationReport { final String covName = (String)reportTable.get(i, RecalUtils.COVARIATE_NAME_COLUMN_NAME); final int covIndex = optionalCovariateIndexes.get(covName); final Object covValue = reportTable.get(i, RecalUtils.COVARIATE_VALUE_COLUMN_NAME); - tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex].keyFromValue(covValue); + tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex].keyFromValue(covValue); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); tempCOVarray[3] = event.ordinal(); - recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex).put(getRecalDatum(reportTable, i, false), 
tempCOVarray); + recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index a6b1e13b9..05f711dd5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -42,15 +42,9 @@ import java.util.ArrayList; public final class RecalibrationTables { public enum TableType { - READ_GROUP_TABLE(0), - QUALITY_SCORE_TABLE(1), - OPTIONAL_COVARIATE_TABLES_START(2); - - public final int index; - - private TableType(final int index) { - this.index = index; - } + READ_GROUP_TABLE, + QUALITY_SCORE_TABLE, + OPTIONAL_COVARIATE_TABLES_START; } private final ArrayList> tables; @@ -60,7 +54,7 @@ public final class RecalibrationTables { private final PrintStream log; public RecalibrationTables(final Covariate[] covariates) { - this(covariates, covariates[TableType.READ_GROUP_TABLE.index].maximumKeyValue() + 1, null); + this(covariates, covariates[TableType.READ_GROUP_TABLE.ordinal()].maximumKeyValue() + 1, null); } public RecalibrationTables(final Covariate[] covariates, final int numReadGroups) { @@ -72,31 +66,31 @@ public final class RecalibrationTables { for ( int i = 0; i < covariates.length; i++ ) tables.add(i, null); // initialize so we can set below - qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.index].maximumKeyValue() + 1; + qualDimension = covariates[TableType.QUALITY_SCORE_TABLE.ordinal()].maximumKeyValue() + 1; this.numReadGroups = numReadGroups; this.log = log; - tables.set(TableType.READ_GROUP_TABLE.index, + tables.set(TableType.READ_GROUP_TABLE.ordinal(), log == null ? 
new NestedIntegerArray(numReadGroups, eventDimension) : new LoggingNestedIntegerArray(log, "READ_GROUP_TABLE", numReadGroups, eventDimension)); - tables.set(TableType.QUALITY_SCORE_TABLE.index, makeQualityScoreTable()); + tables.set(TableType.QUALITY_SCORE_TABLE.ordinal(), makeQualityScoreTable()); - for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.index; i < covariates.length; i++) + for (int i = TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal(); i < covariates.length; i++) tables.set(i, log == null ? new NestedIntegerArray(numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension) : - new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.index + 1), + new LoggingNestedIntegerArray(log, String.format("OPTIONAL_COVARIATE_TABLE_%d", i - TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + 1), numReadGroups, qualDimension, covariates[i].maximumKeyValue()+1, eventDimension)); } @Ensures("result != null") public NestedIntegerArray getReadGroupTable() { - return getTable(TableType.READ_GROUP_TABLE.index); + return getTable(TableType.READ_GROUP_TABLE.ordinal()); } @Ensures("result != null") public NestedIntegerArray getQualityScoreTable() { - return getTable(TableType.QUALITY_SCORE_TABLE.index); + return getTable(TableType.QUALITY_SCORE_TABLE.ordinal()); } @Ensures("result != null") diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index aa0419fed..000d9bee7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -94,8 +94,8 @@ public class RecalibrationReportUnitTest { qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], 
errorMode.ordinal()); nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { - final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); - final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j]; + final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j); + final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.ordinal() + j]; if ( covValue >= 0 ) { covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); nKeys++; diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java index 93e52ae83..7a947a539 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationTablesUnitTest.java @@ -29,15 +29,46 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.covariates.*; import org.testng.Assert; +import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; +import java.util.Arrays; +import java.util.List; + public final class RecalibrationTablesUnitTest extends BaseTest { + private RecalibrationTables tables; + private Covariate[] covariates; + private int numReadGroups = 6; + final byte qualByte = 1; + final List combineStates = Arrays.asList(0, 1, 2); + + @BeforeMethod + private void makeTables() { + covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); + tables = new RecalibrationTables(covariates, numReadGroups); + fillTable(tables); + } 
+ + private void fillTable(final RecalibrationTables tables) { + for ( int iterations = 0; iterations < 10; iterations++ ) { + for ( final EventType et : EventType.values() ) { + for ( final int rg : combineStates) { + final double error = rg % 2 == 0 ? 1 : 0; + RecalUtils.incrementDatumOrPutIfNecessary(tables.getReadGroupTable(), qualByte, error, rg, et.ordinal()); + for ( final int qual : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(tables.getQualityScoreTable(), qualByte, error, rg, qual, et.ordinal()); + for ( final int cycle : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(2), qualByte, error, rg, qual, cycle, et.ordinal()); + for ( final int context : combineStates) + RecalUtils.incrementDatumOrPutIfNecessary(tables.getTable(3), qualByte, error, rg, qual, context, et.ordinal()); + } + } + } + } + } + @Test public void basicTest() { - final Covariate[] covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); - final int numReadGroups = 6; - final RecalibrationTables tables = new RecalibrationTables(covariates, numReadGroups); - final Covariate qualCov = covariates[1]; final Covariate cycleCov = covariates[2]; final Covariate contextCov = covariates[3]; @@ -45,11 +76,11 @@ public final class RecalibrationTablesUnitTest extends BaseTest { Assert.assertEquals(tables.numTables(), covariates.length); Assert.assertNotNull(tables.getReadGroupTable()); - Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.index)); + Assert.assertEquals(tables.getReadGroupTable(), tables.getTable(RecalibrationTables.TableType.READ_GROUP_TABLE.ordinal())); testDimensions(tables.getReadGroupTable(), numReadGroups); Assert.assertNotNull(tables.getQualityScoreTable()); - Assert.assertEquals(tables.getQualityScoreTable(), tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.index)); + Assert.assertEquals(tables.getQualityScoreTable(), 
tables.getTable(RecalibrationTables.TableType.QUALITY_SCORE_TABLE.ordinal())); testDimensions(tables.getQualityScoreTable(), numReadGroups, qualCov.maximumKeyValue() + 1); Assert.assertNotNull(tables.getTable(2)); @@ -72,13 +103,74 @@ public final class RecalibrationTablesUnitTest extends BaseTest { @Test public void basicMakeQualityScoreTable() { - final Covariate[] covariates = RecalibrationTestUtils.makeInitializedStandardCovariates(); - final int numReadGroups = 6; - final RecalibrationTables tables = new RecalibrationTables(covariates, numReadGroups); - final Covariate qualCov = covariates[1]; final NestedIntegerArray copy = tables.makeQualityScoreTable(); testDimensions(copy, numReadGroups, qualCov.maximumKeyValue()+1); Assert.assertEquals(copy.getAllValues().size(), 0); } + + @Test + public void testCombine1() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + fillTable(merged); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() * 2); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() * 2); + } + } + } + + @Test + public void testCombineEmptyOther() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + + merged.combine(tables); + + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( 
final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations()); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches()); + } + } + } + + @Test + public void testCombinePartial() { + final RecalibrationTables merged = new RecalibrationTables(covariates, numReadGroups); + for ( final int rg : combineStates) { + RecalUtils.incrementDatumOrPutIfNecessary(merged.getTable(3), qualByte, 1, rg, 0, 0, 0); + } + + merged.combine(tables); + for ( int i = 0; i < tables.numTables(); i++ ) { + NestedIntegerArray table = tables.getTable(i); + NestedIntegerArray mergedTable = merged.getTable(i); + + Assert.assertEquals(table.getAllLeaves().size(), mergedTable.getAllLeaves().size()); + for ( final NestedIntegerArray.Leaf leaf : table.getAllLeaves() ) { + final RecalDatum mergedValue = mergedTable.get(leaf.keys); + Assert.assertNotNull(mergedValue); + + final int delta = i == 3 && leaf.keys[1] == 0 && leaf.keys[2] == 0 && leaf.keys[3] == 0 ? 
1 : 0; + Assert.assertEquals(mergedValue.getNumObservations(), leaf.value.getNumObservations() + delta); + Assert.assertEquals(mergedValue.getNumMismatches(), leaf.value.getNumMismatches() + delta); + } + } + } } From 2ab55e4ee7976e49e41d9dba50c74eae6e50612a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 17:14:32 -0500 Subject: [PATCH 24/30] Fixing bug in TraverseDuplicates.printProgress call: only passes in single location of genome loc --- .../sting/gatk/traversals/TraverseDuplicates.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java index 2e43ef8f8..8ebea8b54 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseDuplicates.java @@ -197,7 +197,7 @@ public class TraverseDuplicates extends TraversalEngine Date: Sat, 5 Jan 2013 12:59:18 -0500 Subject: [PATCH 25/30] Make multi-threaded progress meter daemon unit test more robust --- .../utils/progressmeter/ProgressMeterDaemonUnitTest.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java index 420db683e..5c6c675a7 100644 --- a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -96,7 +96,11 @@ public class ProgressMeterDaemonUnitTest extends BaseTest { daemon.done(); Assert.assertTrue(daemon.isDone()); - Assert.assertEquals(meter.progressCalls.size(), ticks, - "Expected " + ticks + " progress calls from daemon thread, but only got " + 
meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + Assert.assertTrue(meter.progressCalls.size() >= 1, + "Expected at least one progress update call from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + + final int tolerance = (int)Math.ceil(0.8 * meter.progressCalls.size()); + Assert.assertTrue(Math.abs(meter.progressCalls.size() - ticks) <= tolerance, + "Expected " + ticks + " progress calls from daemon thread, but got " + meter.progressCalls.size() + " and a tolerance of only " + tolerance); } } From b73d72fe94d0cf5512eb342d53026e6a9d6f72d6 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 6 Jan 2013 01:56:57 -0500 Subject: [PATCH 26/30] update docs for LEftAlignVariants --- .../sting/gatk/walkers/variantutils/LeftAlignVariants.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java index fbb81fda0..16a9125a5 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/LeftAlignVariants.java @@ -54,9 +54,10 @@ import java.util.*; * Left-aligns indels from a variants file. * *

- * LeftAlignVariants is a tool that takes a VCF file and left-aligns any indels inside it. The same indel can often be + * LeftAlignVariants is a tool that takes a VCF file and left-aligns the indels inside it. The same indel can often be * placed at multiple positions and still represent the same haplotype. While the standard convention with VCF is to * place an indel at the left-most position this doesn't always happen, so this tool can be used to left-align them. + * Note that this tool cannot handle anything other than bi-allelic, simple indels. Complex events are written out unchanged. * *

Input

*

From bf25e151ff5700f9ef7ecf320a168d258e507015 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 6 Jan 2013 12:26:32 -0500 Subject: [PATCH 27/30] Handle long->int precision in Bayesian estimate --- .../sting/utils/recalibration/RecalDatum.java | 18 +++++++++++------ .../recalibration/RecalDatumUnitTest.java | 20 ++++++++----------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java index 4eb069542..eebc86b6b 100755 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalDatum.java @@ -341,12 +341,22 @@ public class RecalDatum { return log10QempPriorCache[difference]; } - static protected double log10QempLikelihood(final double Qempirical, final long nObservations, final long nErrors) { + static protected double log10QempLikelihood(final double Qempirical, long nObservations, long nErrors) { if ( nObservations == 0 ) return 0.0; + // the binomial code requires ints as input (because it does caching). This should theoretically be fine because + // there is plenty of precision in 2^31 observations, but we need to make sure that we don't have overflow + // before casting down to an int. 
+ if ( nObservations > Integer.MAX_VALUE ) { + // we need to decrease nErrors by the same fraction that we are decreasing nObservations + final double fraction = (double)Integer.MAX_VALUE / (double)nObservations; + nErrors = Math.round((double)nErrors * fraction); + nObservations = Integer.MAX_VALUE; + } + // this is just a straight binomial PDF - double log10Prob = MathUtils.log10BinomialProbability(longToInt(nObservations), longToInt(nErrors), QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); + double log10Prob = MathUtils.log10BinomialProbability((int)nObservations, (int)nErrors, QualityUtils.qualToErrorProbLog10((byte)(int)Qempirical)); if ( Double.isInfinite(log10Prob) || Double.isNaN(log10Prob) ) log10Prob = -Double.MAX_VALUE; @@ -355,8 +365,4 @@ public class RecalDatum { return log10Prob; } - - static protected int longToInt(final long l) { - return (l > Integer.MAX_VALUE) ? Integer.MAX_VALUE : (int)l; - } } \ No newline at end of file diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java index 2305fe566..9b2938d80 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalDatumUnitTest.java @@ -206,7 +206,7 @@ public class RecalDatumUnitTest extends BaseTest { } @Test - public void testlog10QempLikelihood() { + public void testBayesianEstimateOfEmpiricalQuality() { final int Qrep = 20; @@ -229,7 +229,7 @@ public class RecalDatumUnitTest extends BaseTest { } @Test - public void testBayesianEstimateOfEmpiricalQuality() { + public void testlog10QempLikelihood() { final double[] Qemps = new double[] { 0.0, 10.0, 20.0, 30.0 }; final int[] observations = new int[] {0, 10, 1000, 1000000}; @@ -248,16 +248,12 @@ public class RecalDatumUnitTest extends BaseTest { } } } - } - @Test - public void testLongToInt() { - 
long l = new Long((long)Integer.MAX_VALUE); - int i = RecalDatum.longToInt(l); - Assert.assertEquals(i, Integer.MAX_VALUE); - - l++; - i = RecalDatum.longToInt(l); - Assert.assertEquals(i, Integer.MAX_VALUE); + long bigNum = new Long((long)Integer.MAX_VALUE); + bigNum *= 2L; + final double log10likelihood = RecalDatum.log10QempLikelihood(30, bigNum, 100000); + Assert.assertTrue(log10likelihood < 0.0); + Assert.assertFalse(Double.isInfinite(log10likelihood)); + Assert.assertFalse(Double.isNaN(log10likelihood)); } } \ No newline at end of file From ea21dc9cfb57e1e9ac05c9517d3065a3fb6c3288 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 6 Jan 2013 12:44:13 -0500 Subject: [PATCH 28/30] I just committed this - why didn't it work before? Trying again... --- .../broadinstitute/sting/utils/recalibration/RecalUtils.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index 196f9a115..266ab9673 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -817,12 +817,12 @@ public class RecalUtils { if ( ! table.put(createDatumObject(qual, isError), keys) ) { // Failed to put a new item because another thread came along and put an item here first. 
// Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); + table.get(keys).increment(1L, isError); } } else { // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); + existingDatum.increment(1L, isError); } } From ef638489d52ecfc95812bf11f9fb0134c967111b Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 6 Jan 2013 14:07:59 -0500 Subject: [PATCH 29/30] Fixing BQSR gatherer test to keep up to date with latest changes --- .../sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java index 220ffa1e1..44bc7de5f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/BQSRGathererUnitTest.java @@ -42,7 +42,6 @@ public class BQSRGathererUnitTest extends BaseTest { GATKReport originalReport = new GATKReport(recal_original); GATKReport calculatedReport = new GATKReport(output); - // test the Arguments table List columnsToTest = Arrays.asList(RecalUtils.ARGUMENT_COLUMN_NAME, RecalUtils.ARGUMENT_VALUE_COLUMN_NAME); GATKReportTable originalTable = originalReport.getTable(RecalUtils.ARGUMENT_REPORT_TABLE_TITLE); @@ -86,7 +85,9 @@ public class BQSRGathererUnitTest extends BaseTest { for (String column : columnsToTest) { Object actual = calculated.get(new Integer(row), column); Object expected = original.get(row, column); - Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Calc Table: " + calculated.getTableName()); + //if ( !actual.equals(expected) ) + // System.out.println("Row=" + row + " Table=" + original.getTableName() + " Column=" + column + " Expected=" + expected + " 
Actual=" + actual); + Assert.assertEquals(actual, expected, "Row: " + row + " Original Table: " + original.getTableName() + " Column=" + column); } } From 8822b8e7c8130a8fc5225ed09c968d472d87e5d0 Mon Sep 17 00:00:00 2001 From: Eric Banks Date: Sun, 6 Jan 2013 14:30:45 -0500 Subject: [PATCH 30/30] Moving HelpConstants out of HelpUtils so that we stop getting these ProgramElementDoc errors when com.sun.javadoc cannot load on a user's system. --- .../sting/commandline/CommandLineProgram.java | 4 +- .../sting/gatk/CommandLineGATK.java | 7 +- .../sting/gatk/filters/FilterManager.java | 4 +- .../VariantDataManager.java | 4 +- .../sting/utils/exceptions/UserException.java | 6 +- .../sting/utils/help/ForumAPIUtils.java | 4 +- .../sting/utils/help/GATKDocUtils.java | 2 +- .../sting/utils/help/HelpConstants.java | 81 +++++++++++++++++++ .../sting/utils/help/HelpUtils.java | 9 --- 9 files changed, 95 insertions(+), 26 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java diff --git a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java index fb15a3722..283b98774 100644 --- a/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java +++ b/public/java/src/org/broadinstitute/sting/commandline/CommandLineProgram.java @@ -32,8 +32,8 @@ import org.apache.log4j.PatternLayout; import org.broadinstitute.sting.gatk.CommandLineGATK; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.help.ApplicationDetails; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.help.HelpFormatter; -import org.broadinstitute.sting.utils.help.HelpUtils; import java.io.IOException; import java.util.*; @@ -289,7 +289,7 @@ public abstract class CommandLineProgram { */ private static void printDocumentationReference() { errorPrintf("Visit 
our website and forum for extensive documentation and answers to %n"); - errorPrintf("commonly asked questions " + HelpUtils.BASE_GATK_URL + "%n"); + errorPrintf("commonly asked questions " + HelpConstants.BASE_GATK_URL + "%n"); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java index d1711ba4c..bfb1b5720 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java +++ b/public/java/src/org/broadinstitute/sting/gatk/CommandLineGATK.java @@ -36,10 +36,7 @@ import org.broadinstitute.sting.gatk.refdata.tracks.FeatureManager; import org.broadinstitute.sting.gatk.walkers.Attribution; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.exceptions.UserException; -import org.broadinstitute.sting.utils.help.ApplicationDetails; -import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.GATKDocUtils; -import org.broadinstitute.sting.utils.help.HelpUtils; +import org.broadinstitute.sting.utils.help.*; import org.broadinstitute.sting.utils.text.TextFormattingUtils; import java.util.*; @@ -161,7 +158,7 @@ public class CommandLineGATK extends CommandLineExecutable { List header = new ArrayList(); header.add(String.format("The Genome Analysis Toolkit (GATK) v%s, Compiled %s",getVersionNumber(), getBuildTime())); header.add("Copyright (c) 2010 The Broad Institute"); - header.add("For support and documentation go to " + HelpUtils.BASE_GATK_URL); + header.add("For support and documentation go to " + HelpConstants.BASE_GATK_URL); return header; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java index 89099c587..54e0d852c 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java +++ 
b/public/java/src/org/broadinstitute/sting/gatk/filters/FilterManager.java @@ -27,7 +27,7 @@ package org.broadinstitute.sting.gatk.filters; import org.broadinstitute.sting.utils.classloader.PluginManager; import org.broadinstitute.sting.utils.help.GATKDocUtils; -import org.broadinstitute.sting.utils.help.HelpUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; import java.util.Collection; import java.util.List; @@ -71,7 +71,7 @@ public class FilterManager extends PluginManager { return String.format("Read filter %s not found. Available read filters:%n%n%s%n%n%s",pluginName, userFriendlyListofReadFilters(availableFilters), - "Please consult the GATK Documentation (" + HelpUtils.GATK_DOCS_URL + ") for more information."); + "Please consult the GATK Documentation (" + HelpConstants.GATK_DOCS_URL + ") for more information."); } private String userFriendlyListofReadFilters(List> filters) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java index f18db412f..8e27cd2d3 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantrecalibration/VariantDataManager.java @@ -32,7 +32,7 @@ import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.codecs.vcf.VCFConstants; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; -import org.broadinstitute.sting.utils.help.HelpUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.variantcontext.writer.VariantContextWriter; import org.broadinstitute.sting.utils.collections.ExpandingArrayList; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -81,7 +81,7 @@ public class VariantDataManager 
{ final double theSTD = standardDeviation(theMean, iii); logger.info( annotationKeys.get(iii) + String.format(": \t mean = %.2f\t standard deviation = %.2f", theMean, theSTD) ); if( Double.isNaN(theMean) ) { - throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpUtils.forumPost("discussion/49/using-variant-annotator")); + throw new UserException.BadInput("Values for " + annotationKeys.get(iii) + " annotation not detected for ANY training variant in the input callset. VariantAnnotator may be used to add these annotations. See " + HelpConstants.forumPost("discussion/49/using-variant-annotator")); } foundZeroVarianceAnnotation = foundZeroVarianceAnnotation || (theSTD < 1E-6); diff --git a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java index 523fd5a97..31ef13148 100755 --- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java +++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java @@ -30,7 +30,7 @@ import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.help.DocumentedGATKFeature; -import org.broadinstitute.sting.utils.help.HelpUtils; +import org.broadinstitute.sting.utils.help.HelpConstants; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.broadinstitute.sting.utils.variantcontext.VariantContext; @@ -278,7 +278,7 @@ public class UserException extends ReviewedStingException { public static class ReadMissingReadGroup extends MalformedBAM { public ReadMissingReadGroup(SAMRecord read) { - super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM 
header, both of which are required by the GATK. Please use " + HelpUtils.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); + super(read, String.format("Read %s is either missing the read group or its read group is not defined in the BAM header, both of which are required by the GATK. Please use " + HelpConstants.forumPost("discussion/59/companion-utilities-replacereadgroups to fix this problem"), read.getReadName())); } } @@ -354,7 +354,7 @@ public class UserException extends ReviewedStingException { super(String.format("Lexicographically sorted human genome sequence detected in %s." + "\nFor safety's sake the GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs." + "\nThis is because all distributed GATK resources are sorted in karyotypic order, and your processing will fail when you need to use these files." - + "\nYou can use the ReorderSam utility to fix this problem: " + HelpUtils.forumPost("discussion/58/companion-utilities-reordersam") + + "\nYou can use the ReorderSam utility to fix this problem: " + HelpConstants.forumPost("discussion/58/companion-utilities-reordersam") + "\n %s contigs = %s", name, name, ReadUtils.prettyPrintSequenceRecords(dict))); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java index 64238dc73..2ce9d9709 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/ForumAPIUtils.java @@ -50,7 +50,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); List output = new ArrayList(); - String text = httpGet(HelpUtils.GATK_FORUM_API_URL + "categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); + String text = httpGet(HelpConstants.GATK_FORUM_API_URL + 
"categories.json?CategoryIdentifier=tool-bulletin&page=1-100000&" + ACCESS_TOKEN + forumKey); APIQuery details = gson.fromJson(text, APIQuery.class); ForumDiscussion[] discussions = details.Discussions; @@ -158,7 +158,7 @@ public class ForumAPIUtils { Gson gson = new Gson(); String data = gson.toJson(post.getPostData()); - httpPost(data, HelpUtils.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); + httpPost(data, HelpConstants.GATK_FORUM_API_URL + "post/discussion.json?" + ACCESS_TOKEN + forumKey); } diff --git a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java index 21054a794..4b62dd9ed 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/GATKDocUtils.java @@ -28,7 +28,7 @@ public class GATKDocUtils { /** * The URL root for RELEASED GATKDOC units */ - public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpUtils.GATK_DOCS_URL; + public final static String URL_ROOT_FOR_RELEASE_GATKDOCS = HelpConstants.GATK_DOCS_URL; /** * The URL root for STABLE GATKDOC units */ diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java new file mode 100644 index 000000000..da66ff33d --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpConstants.java @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2011, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the 
following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.help; + +import com.sun.javadoc.FieldDoc; +import com.sun.javadoc.PackageDoc; +import com.sun.javadoc.ProgramElementDoc; +import org.broadinstitute.sting.utils.classloader.JVMUtils; + +import java.lang.reflect.Field; + +public class HelpConstants { + + public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; + public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; + public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; + public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; + + public static String forumPost(String post) { + return GATK_FORUM_URL + post; + } + + protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { + try { + Class type = getClassForDoc(classDoc); + return lhsClass.isAssignableFrom(type) && (!requireConcrete || JVMUtils.isConcrete(type)); + } catch (Throwable t) { + // Ignore errors. 
+ return false; + } + } + + protected static Class getClassForDoc(ProgramElementDoc doc) throws ClassNotFoundException { + return Class.forName(getClassName(doc)); + } + + protected static Field getFieldForFieldDoc(FieldDoc fieldDoc) { + try { + Class clazz = getClassForDoc(fieldDoc.containingClass()); + return JVMUtils.findField(clazz, fieldDoc.name()); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + } + + /** + * Reconstitute the class name from the given class JavaDoc object. + * + * @param doc the Javadoc model for the given class. + * @return The (string) class name of the given class. + */ + protected static String getClassName(ProgramElementDoc doc) { + PackageDoc containingPackage = doc.containingPackage(); + return containingPackage.name().length() > 0 ? + String.format("%s.%s", containingPackage.name(), doc.name()) : + String.format("%s", doc.name()); + } + +} \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java index 930bbc996..87c656e21 100644 --- a/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/help/HelpUtils.java @@ -33,15 +33,6 @@ import java.lang.reflect.Field; public class HelpUtils { - public final static String BASE_GATK_URL = "http://www.broadinstitute.org/gatk"; - public final static String GATK_DOCS_URL = BASE_GATK_URL + "/gatkdocs/"; - public final static String GATK_FORUM_URL = "http://gatkforums.broadinstitute.org/"; - public final static String GATK_FORUM_API_URL = "https://gatkforums.broadinstitute.org/api/v1/"; - - public static String forumPost(String post) { - return GATK_FORUM_URL + post; - } - protected static boolean assignableToClass(ProgramElementDoc classDoc, Class lhsClass, boolean requireConcrete) { try { Class type = getClassForDoc(classDoc);