From c1ba12d71a7fa0879dcd4e5a8fcadbd6ece025de Mon Sep 17 00:00:00 2001 From: Tad Jordan Date: Thu, 3 Jan 2013 16:25:57 -0500 Subject: [PATCH 01/18] Added unit test for outputting sorted GATKReport Tables - Made few small modifications to code - Replaced the two arguments in GATKReportTable constructor with an enum used to specify way of sorting the table --- .../sting/gatk/report/GATKReport.java | 9 +- .../sting/gatk/report/GATKReportTable.java | 187 +++++++++--------- .../bqsr/RecalibrationArgumentCollection.java | 2 +- .../diagnostics/ErrorRatePerCycle.java | 2 +- .../varianteval/VariantEvalReportWriter.java | 2 +- .../utils/recalibration/QuantizationInfo.java | 2 +- .../sting/utils/recalibration/RecalUtils.java | 2 +- .../sting/gatk/report/GATKReportUnitTest.java | 88 ++++++++- 8 files changed, 185 insertions(+), 109 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java index e69924930..1451b8cde 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReport.java @@ -117,7 +117,7 @@ public class GATKReport { * @param numColumns the number of columns in this table */ public void addTable(final String tableName, final String tableDescription, final int numColumns) { - addTable(tableName, tableDescription, numColumns, false, false); + addTable(tableName, tableDescription, numColumns, GATKReportTable.TableSortingWay.DO_NOT_SORT); } /** @@ -126,11 +126,10 @@ public class GATKReport { * @param tableName the name of the table * @param tableDescription the description of the table * @param numColumns the number of columns in this table - * @param sortByRowID whether to sort the rows by the row ID - * @param sortByAllColumns whether to sort the rows by all columns starting from leftmost column + * @param sortingWay way to sort table */ - public void addTable(final String tableName, final String tableDescription, final int numColumns, final boolean sortByRowID, final boolean sortByAllColumns) { - GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortByRowID, sortByAllColumns); + public void addTable(final String tableName, final String tableDescription, final int numColumns, final GATKReportTable.TableSortingWay sortingWay) { + GATKReportTable table = new GATKReportTable(tableName, tableDescription, numColumns, sortingWay); tables.put(tableName, table); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java index 2bf7c9609..226e50f81 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java +++ b/public/java/src/org/broadinstitute/sting/gatk/report/GATKReportTable.java @@ -46,8 +46,7 @@ public class GATKReportTable { private final String tableName; private final String tableDescription; - private final boolean sortByRowID; - private final boolean sortByAllColumns; + private final TableSortingWay sortingWay; private List underlyingData; private final List columnInfo; @@ -73,6 +72,12 @@ public class GATKReportTable { public int index() { return index; } } + public enum TableSortingWay { + SORT_BY_ROW, + SORT_BY_COLUMN, + DO_NOT_SORT + } + protected enum TableNameHeaderFields { NAME(2), DESCRIPTION(3); @@ -107,10 +112,7 @@ public class GATKReportTable { tableDescription = (tableNameData.length <= TableNameHeaderFields.DESCRIPTION.index()) ? "" : tableNameData[TableNameHeaderFields.DESCRIPTION.index()]; // table may have no description! (and that's okay) // when reading from a file, we do not re-sort the rows - sortByRowID = false; - - // when reading from a file, we do not re-sort the rows - sortByAllColumns = false; + sortingWay = TableSortingWay.DO_NOT_SORT; // initialize the data final int nColumns = Integer.parseInt(tableData[TableDataHeaderFields.COLS.index()]); @@ -181,7 +183,7 @@ public class GATKReportTable { * @param numColumns the number of columns in this table */ public GATKReportTable(final String tableName, final String tableDescription, final int numColumns) { - this(tableName, tableDescription, numColumns, true, false); + this(tableName, tableDescription, numColumns, TableSortingWay.SORT_BY_ROW); } /** @@ -190,10 +192,9 @@ public class GATKReportTable { * @param tableName the name of the table * @param tableDescription the description of the table * @param numColumns the number of columns in this table - * @param sortByRowID whether to sort rows by the row ID (instead of the order in which they were added) - * @param sortByAllColumns whether to sort rows by all columns (instead of the order in which they were added) + * @param sortingWay in what way to sort rows (instead of the order in which they were added) */ - public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final boolean sortByRowID, final boolean sortByAllColumns) { + public GATKReportTable(final String tableName, final String tableDescription, final int numColumns, final TableSortingWay sortingWay) { if ( !isValidName(tableName) ) { throw new ReviewedStingException("Attempted to set a GATKReportTable name of '" + tableName + "'. GATKReportTable names must be purely alphanumeric - no spaces or special characters are allowed."); } @@ -204,8 +205,7 @@ public class GATKReportTable { this.tableName = tableName; this.tableDescription = tableDescription; - this.sortByRowID = sortByRowID; - this.sortByAllColumns = sortByAllColumns; + this.sortingWay = sortingWay; underlyingData = new ArrayList(INITITAL_ARRAY_SIZE); columnInfo = new ArrayList(numColumns); @@ -218,7 +218,7 @@ public class GATKReportTable { * @param tableToCopy */ public GATKReportTable(final GATKReportTable tableToCopy, final boolean copyData) { - this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortByRowID, tableToCopy.sortByAllColumns); + this(tableToCopy.getTableName(), tableToCopy.getTableDescription(), tableToCopy.getNumColumns(), tableToCopy.sortingWay); for ( final GATKReportColumn column : tableToCopy.getColumnInfo() ) addColumn(column.getColumnName(), column.getFormat()); if ( copyData ) @@ -569,56 +569,53 @@ public class GATKReportTable { out.println(); // write the table body - if ( sortByAllColumns ) { - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; - int result = EQUAL; + int result = EQUAL; - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { + result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } if( result != EQUAL) { return result; } - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - if( result != EQUAL) { - return result; - } - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - if( result != EQUAL) { - return result; - } } + return result; } - return result; - } - }); - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } else if ( sortByRowID ) { - // make sure that there are exactly the correct number of ID mappings - if ( rowIdToIndex.size() != underlyingData.size() ) - throw new ReviewedStingException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); + }); + for ( final Object[] row : underlyingData ) + writeRow(out, row); + break; + case SORT_BY_ROW: + // make sure that there are exactly the correct number of ID mappings + if ( rowIdToIndex.size() != underlyingData.size() ) + throw new ReviewedStingException("There isn't a 1-to-1 mapping from row ID to index; this can happen when rows are not created consistently"); - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - throw new ReviewedStingException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); - } - for ( final Map.Entry rowKey : sortedMap.entrySet() ) - writeRow(out, underlyingData.get(rowKey.getValue())); - } else { - for ( final Object[] row : underlyingData ) - writeRow(out, row); - } + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch (ClassCastException e) { + throw new ReviewedStingException("Unable to sort the rows based on the row IDs because the ID Objects are of different types"); + } + for ( final Map.Entry rowKey : sortedMap.entrySet() ) + writeRow(out, underlyingData.get(rowKey.getValue())); + break; + case DO_NOT_SORT: + for ( final Object[] row : underlyingData ) + writeRow(out, row); + } out.println(); } @@ -735,53 +732,47 @@ public class GATKReportTable { } private List getOrderedRows() { - if ( sortByAllColumns ) { - Collections.sort(underlyingData, new Comparator() { - //INVARIANT the two arrays are of the same length and corresponding elements are of the same type - @Override - public int compare(Object[] objectArr1, Object[] objectArr2) { - final int EQUAL = 0; - int result = EQUAL; - - int l = objectArr1.length; - for (int x = 0; x < l; x++) { - if (objectArr1[x] instanceof Integer) { - result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); - if( result != EQUAL) { - return result; + switch (sortingWay) { + case SORT_BY_COLUMN: + Collections.sort(underlyingData, new Comparator() { + //INVARIANT the two arrays are of the same length and corresponding elements are of the same type + @Override + public int compare(Object[] objectArr1, Object[] objectArr2) { + final int EQUAL = 0; + int result = EQUAL; + int l = objectArr1.length; + for (int x = 0; x < l; x++) { + if (objectArr1[x] instanceof Integer) { + result = ((Integer)objectArr1[x]).compareTo((Integer)objectArr2[x]); + } else if (objectArr1[x] instanceof Double) { + result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); + } else { // default uses String comparison + result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); + } + if( result != EQUAL) { + return result; + } } - } else if (objectArr1[x] instanceof Double) { - result = ((Double)objectArr1[x]).compareTo((Double)objectArr2[x]); - if( result != EQUAL) { - return result; - } - } else { // default uses String comparison - result = objectArr1[x].toString().compareTo(objectArr2[x].toString()); - if( result != EQUAL) { - return result; - } - } + return result; } - return result; + }); + return underlyingData; + case SORT_BY_ROW: + final TreeMap sortedMap; + try { + sortedMap = new TreeMap(rowIdToIndex); + } catch (ClassCastException e) { + return underlyingData; } - }); - return underlyingData; - } else if ( !sortByRowID ) { - return underlyingData; + + final List orderedData = new ArrayList(underlyingData.size()); + for ( final int rowKey : sortedMap.values() ) + orderedData.add(underlyingData.get(rowKey)); + + return orderedData; + default: + return underlyingData; } - - final TreeMap sortedMap; - try { - sortedMap = new TreeMap(rowIdToIndex); - } catch (ClassCastException e) { - return underlyingData; - } - - final List orderedData = new ArrayList(underlyingData.size()); - for ( final int rowKey : sortedMap.values() ) - orderedData.add(underlyingData.get(rowKey)); - - return orderedData; } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java index 2f0f976fa..622413b18 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationArgumentCollection.java @@ -207,7 +207,7 @@ public class RecalibrationArgumentCollection { public GATKReportTable generateReportTable(final String covariateNames) { GATKReportTable argumentsTable; if(SORT_BY_ALL_COLUMNS) { - argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, false, true); + argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { argumentsTable = new GATKReportTable("Arguments", "Recalibration argument collection values used in this run", 2); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index b4e781e91..5972322f8 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -124,7 +124,7 @@ public class ErrorRatePerCycle extends LocusWalker { public void initialize() { report = new GATKReport(); - report.addTable(reportName, reportDescription, 6, true, false); + report.addTable(reportName, reportDescription, 6, GATKReportTable.TableSortingWay.SORT_BY_ROW); table = report.getTable(reportName); table.addColumn("readgroup"); table.addColumn("cycle"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 6af70811f..6dad128fe 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -162,7 +162,7 @@ public class VariantEvalReportWriter { // create the table final String tableName = ve.getSimpleName(); final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); - report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), true, false); + report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), GATKReportTable.TableSortingWay.SORT_BY_ROW); // grab the table, and add the columns we need to it final GATKReportTable table = report.getTable(tableName); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index e0c1261fe..fc942499c 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -70,7 +70,7 @@ public class QuantizationInfo { public GATKReportTable generateReportTable(boolean sortBycols) { GATKReportTable quantizedTable; if(sortBycols) { - quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, false, true); + quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); } diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java index d4e781fdd..58327b924 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalUtils.java @@ -287,7 +287,7 @@ public class RecalUtils { final GATKReportTable reportTable; if (tableIndex <= RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index) { if(sortByCols) { - reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), false, true); + reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size(), GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { reportTable = new GATKReportTable("RecalTable" + reportTableIndex++, "", columnNames.size()); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index d20b70b42..40d8d8ff9 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -32,6 +32,13 @@ import org.testng.annotations.Test; import java.io.File; import java.io.IOException; import java.io.PrintStream; +import java.util.Random; +import java.io.FileInputStream; +import java.io.DataInputStream; +import java.io.BufferedReader; +import java.io.InputStreamReader; +import java.util.ArrayList; + public class GATKReportUnitTest extends BaseTest { @Test @@ -77,6 +84,85 @@ public class GATKReportUnitTest extends BaseTest { Assert.assertEquals(GATKReportColumn.isRightAlign(value), expected, "right align of '" + value + "'"); } + private GATKReportTable getTableWithRandomValues() { + Random number = new Random(123L); + final int VALUESRANGE = 10; + + GATKReport report = GATKReport.newSimpleReport("TableName", "col1", "col2", "col3"); + GATKReportTable table = new GATKReportTable("testSortingTable", "table with random values sorted by columns", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN ); + + final int NUMROWS = 100; + for (int x = 0; x < NUMROWS; x++) { + report.addRow(number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE), number.nextInt(VALUESRANGE)); + } + return table; + } + + @Test(enabled = true) + public void testSortingByColumn() { + Assert.assertEquals(isSorted(getTableWithRandomValues()), true); + } + + private boolean isSorted(GATKReportTable table) { + boolean result = true; + File testingSortingTableFile = new File("myFile.txt"); + + try { + // Connect print stream to the output stream + PrintStream ps = new PrintStream(testingSortingTableFile); + table.write(ps); + ps.close(); + } + catch (Exception e){ + System.err.println ("Error: " + e.getMessage()); + } + + ArrayList rows = new ArrayList(); + try { + // Open the file + FileInputStream fStream = new FileInputStream(testingSortingTableFile); + // Get the object of DataInputStream + DataInputStream in = new DataInputStream(fStream); + BufferedReader br = new BufferedReader(new InputStreamReader(in)); + String strLine; + //Read File Line By Line + while ((strLine = br.readLine()) != null) { + + String[] parts = strLine.split(" "); + int l = parts.length; + int[] row = new int[l]; + for(int n = 0; n < l; n++) { + row[n] = Integer.parseInt(parts[n]); + } + rows.add(row); + } + //Close the input stream + in.close(); + } catch (Exception e){//Catch exception if any + System.err.println("Error: " + e.getMessage()); + } + for (int x = 1; x < rows.size() && result; x++) { + result = checkRowOrder(rows.get(x - 1), rows.get(x)); + } + return result; + } + + private boolean checkRowOrder(int[] row1, int[] row2) { + int l = row1.length; + final int EQUAL = 0; + + int result = EQUAL; + + for(int x = 0; x < l && ( result <= EQUAL); x++) { + result = ((Integer)row1[x]).compareTo(row2[x]); + } + if (result <= EQUAL) { + return true; + } else { + return false; + } + } + private GATKReportTable makeBasicTable() { GATKReport report = GATKReport.newSimpleReport("TableName", "sample", "value"); GATKReportTable table = report.getTable("TableName"); @@ -168,7 +254,7 @@ public class GATKReportUnitTest extends BaseTest { table.set("RZ", "SomeFloat", 535646345.657453464576); table.set("RZ", "TrueFalse", true); - report1.addTable("Table3", "blah", 1, true, false); + report1.addTable("Table3", "blah", 1, GATKReportTable.TableSortingWay.SORT_BY_ROW); report1.getTable("Table3").addColumn("a"); report1.getTable("Table3").addRowIDMapping("q", 2); report1.getTable("Table3").addRowIDMapping("5", 3); From 47e620dfbcc6c7b38f68d578780acecbeb1f81d0 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 19 Dec 2012 14:55:34 -0500 Subject: [PATCH 02/18] Create BAM index to test shard boundaries --- .../TraverseActiveRegionsUnitTest.java | 34 +++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 4cda1455e..8f2f2be70 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -112,7 +112,6 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { dictionary = reference.getSequenceDictionary(); genomeLocParser = new GenomeLocParser(dictionary); - // TODO: test shard boundaries // TODO: reads with indels // TODO: reads which span many regions // TODO: reads which are partially between intervals (in/outside extension) @@ -142,6 +141,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { reads.add(buildSAMRecord("boundary_1_post", "1", 1999, 2050)); reads.add(buildSAMRecord("extended_and_np", "1", 990, 1990)); reads.add(buildSAMRecord("outside_intervals", "1", 5000, 6000)); + reads.add(buildSAMRecord("shard_boundary_1_pre", "1", 16300, 16385)); + reads.add(buildSAMRecord("shard_boundary_1_post", "1", 16384, 16400)); + reads.add(buildSAMRecord("shard_boundary_equal", "1", 16355, 16414)); reads.add(buildSAMRecord("simple20", "20", 10025, 10075)); createBAM(reads); @@ -153,7 +155,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { File indexFile = new File(testBAI); indexFile.deleteOnExit(); - SAMFileWriter out = new SAMFileWriterFactory().makeBAMWriter(reads.get(0).getHeader(), true, outFile); + SAMFileWriter out = new SAMFileWriterFactory().setCreateIndex(true).makeBAMWriter(reads.get(0).getHeader(), true, outFile); for (GATKSAMRecord read : ReadUtils.sortReadsByCoordinate(reads)) { out.addAlignment(read); } @@ -272,6 +274,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -286,6 +291,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -309,6 +320,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -323,6 +337,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "boundary_1_pre", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -347,6 +367,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // boundary_1_post: Non-Primary in 1:1000-1999, Primary in 1:2000-2999 // extended_and_np: Non-Primary in 1:1-999, Primary in 1:1000-1999, Extended in 1:2000-2999 // outside_intervals: none + // shard_boundary_1_pre: Primary in 1:14908-16384, Non-Primary in 1:16385-16927 + // shard_boundary_1_post: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 + // shard_boundary_equal: Non-Primary in 1:14908-16384, Primary in 1:16385-16927 // simple20: Primary in 20:10000-10100 Map activeRegions = getActiveRegions(walker, intervals); @@ -361,6 +384,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 2000, 2999)); verifyReadMapping(region, "boundary_equal", "boundary_unequal", "extended_and_np", "boundary_1_pre", "boundary_1_post"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 14908, 16384)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + + region = activeRegions.get(genomeLocParser.createGenomeLoc("1", 16385, 16927)); + verifyReadMapping(region, "shard_boundary_1_pre", "shard_boundary_1_post", "shard_boundary_equal"); + region = activeRegions.get(genomeLocParser.createGenomeLoc("20", 10000, 10100)); verifyReadMapping(region, "simple20"); } @@ -429,6 +458,7 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { protected GATKSAMRecord buildSAMRecord(String readName, String contig, int alignmentStart, int alignmentEnd) { SAMFileHeader header = ArtificialSAMUtils.createDefaultReadGroup(new SAMFileHeader(), "test", "test"); header.setSequenceDictionary(dictionary); + header.setSortOrder(SAMFileHeader.SortOrder.coordinate); GATKSAMRecord record = new GATKSAMRecord(header); record.setReadName(readName); From ffbd4d85f2e0112b32df0bbba00330b00a0806cf Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 14 Dec 2012 17:37:29 -0500 Subject: [PATCH 03/18] No need to pass fields as parameters --- .../sting/gatk/traversals/TraverseActiveRegions.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 06fc01232..33323ba67 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -227,7 +227,7 @@ public class TraverseActiveRegions extends TraversalEngine extends TraversalEngine reads, final Queue workQueue, final T sum, final ActiveRegionWalker walker ) { + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : reads ) { + for( final GATKSAMRecord read : myReads ) { final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); if( activeRegion.getLocation().overlapsP( readLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) @@ -278,7 +278,7 @@ public class TraverseActiveRegions extends TraversalEngine> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); From 4cc372f53b98dca13a27b6f7130315c87c8074e4 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 21 Dec 2012 15:14:50 -0500 Subject: [PATCH 04/18] LocusShardDataProvider doesn't need its own GenomeLocParser --- .../gatk/datasources/providers/LocusShardDataProvider.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 55304da34..4888b9f41 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -22,11 +22,6 @@ public class LocusShardDataProvider extends ShardDataProvider { */ private final ReadProperties sourceInfo; - /** - * The parser, used to create and build new GenomeLocs. - */ - private final GenomeLocParser genomeLocParser; - /** * The particular locus for which data is provided. Should be contained within shard.getGenomeLocs(). */ @@ -45,7 +40,6 @@ public class LocusShardDataProvider extends ShardDataProvider { public LocusShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { super(shard,genomeLocParser,reference,rods); this.sourceInfo = sourceInfo; - this.genomeLocParser = genomeLocParser; this.locus = locus; this.locusIterator = locusIterator; } From 14a3ac0e3cc227b7df7eb7dba860bfda5d8073e8 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Wed, 2 Jan 2013 17:01:21 -0500 Subject: [PATCH 05/18] Enable the use of alternate shards --- .../arguments/GATKArgumentCollection.java | 10 + .../gatk/executive/LinearMicroScheduler.java | 10 + .../sting/gatk/executive/MicroScheduler.java | 8 +- ...ctiveRegionShardTraverseActiveRegions.java | 290 ++++++++++++++++++ ...imentalReadShardTraverseActiveRegions.java | 290 ++++++++++++++++++ .../utils/activeregion/ActiveRegion.java | 1 + .../ExperimentalActiveRegionShardType.java | 14 + .../TraverseActiveRegionsUnitTest.java | 5 + 8 files changed, 627 insertions(+), 1 deletion(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java create mode 100644 public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java index d9c7c9008..beaeacc85 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java +++ b/public/java/src/org/broadinstitute/sting/gatk/arguments/GATKArgumentCollection.java @@ -37,6 +37,7 @@ import org.broadinstitute.sting.gatk.downsampling.DownsamplingMethod; import org.broadinstitute.sting.gatk.phonehome.GATKRunReport; import org.broadinstitute.sting.gatk.samples.PedigreeValidationType; import org.broadinstitute.sting.utils.QualityUtils; +import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.baq.BAQ; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalSetRule; @@ -448,5 +449,14 @@ public class GATKArgumentCollection { @Hidden public boolean generateShadowBCF = false; // TODO -- remove all code tagged with TODO -- remove me when argument generateShadowBCF is removed + + // -------------------------------------------------------------------------------------------------------------- + // + // Experimental Active Region Traversal modes + // + // -------------------------------------------------------------------------------------------------------------- + + @Argument(fullName = "active_region_traversal_shard_type", shortName = "active_region_traversal_shard_type", doc = "Choose an experimental shard type for active region traversal, instead of the default LocusShard", required = false) + public ExperimentalActiveRegionShardType activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; } diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index f3c1ae91c..84d975879 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -11,6 +11,8 @@ import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.io.DirectOutputTracker; import org.broadinstitute.sting.gatk.io.OutputTracker; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; +import org.broadinstitute.sting.gatk.traversals.ExperimentalActiveRegionShardTraverseActiveRegions; +import org.broadinstitute.sting.gatk.traversals.ExperimentalReadShardTraverseActiveRegions; import org.broadinstitute.sting.gatk.traversals.TraversalEngine; import org.broadinstitute.sting.gatk.traversals.TraverseActiveRegions; import org.broadinstitute.sting.gatk.walkers.Walker; @@ -93,6 +95,14 @@ public class LinearMicroScheduler extends MicroScheduler { final Object result = ((TraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator } + else if( traversalEngine instanceof ExperimentalReadShardTraverseActiveRegions ) { + final Object result = ((ExperimentalReadShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } + else if( traversalEngine instanceof ExperimentalActiveRegionShardTraverseActiveRegions) { + final Object result = ((ExperimentalActiveRegionShardTraverseActiveRegions) traversalEngine).endTraversal(walker, accumulator.getReduceInit()); + accumulator.accumulate(null, result); // Assumes only used with StandardAccumulator + } Object result = accumulator.finishTraversal(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java index f8aec1489..13c11def6 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/MicroScheduler.java @@ -41,6 +41,7 @@ import org.broadinstitute.sting.gatk.traversals.*; import org.broadinstitute.sting.gatk.walkers.*; import org.broadinstitute.sting.utils.AutoFormattingTime; import org.broadinstitute.sting.utils.MathUtils; +import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; import org.broadinstitute.sting.utils.progressmeter.ProgressMeter; @@ -245,7 +246,12 @@ public abstract class MicroScheduler implements MicroSchedulerMBean { } else if (walker instanceof ReadPairWalker) { return new TraverseReadPairs(); } else if (walker instanceof ActiveRegionWalker) { - return new TraverseActiveRegions(); + switch (engine.getArguments().activeRegionShardType) { + case LOCUSSHARD: return new TraverseActiveRegions(); + case READSHARD: return new ExperimentalReadShardTraverseActiveRegions(); + case ACTIVEREGIONSHARD: return new ExperimentalActiveRegionShardTraverseActiveRegions(); + default: throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type of ActiveRegionWalker."); + } } else { throw new UnsupportedOperationException("Unable to determine traversal type, the walker is an unknown type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java new file mode 100644 index 000000000..71cb89ad9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java @@ -0,0 +1,290 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final LinkedList workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + + int minStart = Integer.MAX_VALUE; + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + + return sum; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion(activeRegion, sum, walker); + } else { + break; + } + } + + return sum; + } + + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { + final ArrayList placedReads = new ArrayList(); + for( final GATKSAMRecord read : myReads ) { + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + bestRegion = otherRegionToTest; + } + } + bestRegion.add( read ); + + // The read is also added to all other regions in which it overlaps but marked as non-primary + if( walker.wantsNonPrimaryReads() ) { + if( !bestRegion.equals(activeRegion) ) { + activeRegion.add( read ); + } + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( !bestRegion.equals(otherRegionToTest) ) { + // check for non-primary vs. extended + if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } + } + } + } + placedReads.add( read ); + // check for non-primary vs. extended + } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + activeRegion.add( read ); + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + } + myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region + // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java new file mode 100644 index 000000000..516a7dc34 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -0,0 +1,290 @@ +package org.broadinstitute.sting.gatk.traversals; + +import org.apache.log4j.Logger; +import org.broadinstitute.sting.gatk.WalkerManager; +import org.broadinstitute.sting.gatk.contexts.AlignmentContext; +import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; +import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; +import org.broadinstitute.sting.gatk.walkers.DataSource; +import org.broadinstitute.sting.gatk.walkers.Walker; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.activeregion.ActiveRegion; +import org.broadinstitute.sting.utils.activeregion.ActivityProfile; +import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; +import org.broadinstitute.sting.utils.pileup.PileupElement; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; + +import java.util.*; + +public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { + /** + * our log, which we want to capture anything from this class + */ + protected final static Logger logger = Logger.getLogger(TraversalEngine.class); + + private final LinkedList workQueue = new LinkedList(); + private final LinkedHashSet myReads = new LinkedHashSet(); + + @Override + public String getTraversalUnits() { + return "active regions"; + } + + @Override + public T traverse( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + T sum) { + logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + + final LocusView locusView = new AllLocusView(dataProvider); + + final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); + final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); + final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); + + int minStart = Integer.MAX_VALUE; + final List activeRegions = new LinkedList(); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + // Grab all the previously unseen reads from this pileup and add them to the massive read list + // Note that this must occur before we leave because we are outside the intervals because + // reads may occur outside our intervals but overlap them in the future + // TODO -- this whole HashSet logic should be changed to a linked list of reads with + // TODO -- subsequent pass over them to find the ones overlapping the active regions + for( final PileupElement p : locus.getBasePileup() ) { + final GATKSAMRecord read = p.getRead(); + if( !myReads.contains(read) ) { + myReads.add(read); + } + + // If this is the last pileup for this shard calculate the minimum alignment start so that we know + // which active regions in the work queue are now safe to process + minStart = Math.min(minStart, read.getAlignmentStart()); + } + + // skip this location -- it's not part of our engine intervals + if ( outsideEngineIntervals(location) ) + continue; + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + } + + dataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); + } + + updateCumulativeMetrics(dataProvider.getShard()); + + if ( ! profile.isEmpty() ) + incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); + + // add active regions to queue of regions to process + // first check if can merge active regions over shard boundaries + if( !activeRegions.isEmpty() ) { + if( !workQueue.isEmpty() ) { + final ActiveRegion last = workQueue.getLast(); + final ActiveRegion first = activeRegions.get(0); + if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { + workQueue.removeLast(); + activeRegions.remove(first); + workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + } + } + workQueue.addAll( activeRegions ); + } + + logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); + + // now go and process all of the active regions + sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + + return sum; + } + + /** + * Is the loc outside of the intervals being requested for processing by the GATK? + * @param loc + * @return + */ + private boolean outsideEngineIntervals(final GenomeLoc loc) { + return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); + } + + /** + * Take the individual isActive calls and integrate them into contiguous active regions and + * add these blocks of work to the work queue + * band-pass filter the list of isActive probabilities and turn into active regions + * + * @param profile + * @param activeRegions + * @param activeRegionExtension + * @param maxRegionSize + * @return + */ + private ActivityProfile incorporateActiveRegions(final ActivityProfile profile, + final List activeRegions, + final int activeRegionExtension, + final int maxRegionSize) { + if ( profile.isEmpty() ) + throw new IllegalStateException("trying to incorporate an empty active profile " + profile); + + final ActivityProfile bandPassFiltered = profile.bandPassFilter(); + activeRegions.addAll(bandPassFiltered.createActiveRegions( activeRegionExtension, maxRegionSize )); + return new ActivityProfile( engine.getGenomeLocParser(), profile.hasPresetRegions() ); + } + + + // -------------------------------------------------------------------------------- + // + // simple utility functions + // + // -------------------------------------------------------------------------------- + + private final ActivityProfileResult walkerActiveProb(final ActiveRegionWalker walker, + final RefMetaDataTracker tracker, final ReferenceContext refContext, + final AlignmentContext locus, final GenomeLoc location) { + if ( walker.hasPresetActiveRegions() ) { + return new ActivityProfileResult(location, walker.presetActiveRegions.overlaps(location) ? 1.0 : 0.0); + } else { + return walker.isActive( tracker, refContext, locus ); + } + } + + private ReferenceOrderedView getReferenceOrderedView( final ActiveRegionWalker walker, + final LocusShardDataProvider dataProvider, + final LocusView locusView) { + if ( WalkerManager.getWalkerDataSource(walker) != DataSource.REFERENCE_ORDERED_DATA ) + return new ManagingReferenceOrderedView( dataProvider ); + else + return (RodLocusView)locusView; + } + + // -------------------------------------------------------------------------------- + // + // code to handle processing active regions + // + // -------------------------------------------------------------------------------- + + private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + if( walker.activeRegionOutStream != null ) { + writeActiveRegionsToStream(walker); + return sum; + } else { + return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + } + } + + /** + * Write out each active region to the walker activeRegionOutStream + * + * @param walker + */ + private void writeActiveRegionsToStream( final ActiveRegionWalker walker ) { + // Just want to output the active regions to a file, not actually process them + for( final ActiveRegion activeRegion : workQueue ) { + if( activeRegion.isActive ) { + walker.activeRegionOutStream.println( activeRegion.getLocation() ); + } + } + } + + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + // TODO can implement parallel traversal here + while( workQueue.peek() != null ) { + final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); + if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { + final ActiveRegion activeRegion = workQueue.remove(); + sum = processActiveRegion(activeRegion, sum, walker); + } else { + break; + } + } + + return sum; + } + + private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { + final ArrayList placedReads = new ArrayList(); + for( final GATKSAMRecord read : myReads ) { + final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); + if( activeRegion.getLocation().overlapsP( readLoc ) ) { + // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) + long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); + ActiveRegion bestRegion = activeRegion; + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + bestRegion = otherRegionToTest; + } + } + bestRegion.add( read ); + + // The read is also added to all other regions in which it overlaps but marked as non-primary + if( walker.wantsNonPrimaryReads() ) { + if( !bestRegion.equals(activeRegion) ) { + activeRegion.add( read ); + } + for( final ActiveRegion otherRegionToTest : workQueue ) { + if( !bestRegion.equals(otherRegionToTest) ) { + // check for non-primary vs. extended + if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { + otherRegionToTest.add( read ); + } + } + } + } + placedReads.add( read ); + // check for non-primary vs. extended + } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + activeRegion.add( read ); + } + } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { + activeRegion.add( read ); + } + } + myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region + // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. + + logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); + final M x = walker.map(activeRegion, null); + return walker.reduce( x, sum ); + } + + /** + * Special function called in LinearMicroScheduler to empty out the work queue. + * Ugly for now but will be cleaned up when we push this functionality more into the engine + */ + public T endTraversal( final Walker walker, T sum) { + return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + } +} diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index c12dfcee9..3e3bb220a 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -30,6 +30,7 @@ public class ActiveRegion implements HasGenomeLocation { this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; + this.extension = extension; extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java new file mode 100644 index 000000000..1e9a0ee94 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ExperimentalActiveRegionShardType.java @@ -0,0 +1,14 @@ +package org.broadinstitute.sting.utils.activeregion; + +/** + * Created with IntelliJ IDEA. + * User: thibault + * Date: 1/2/13 + * Time: 4:59 PM + * To change this template use File | Settings | File Templates. + */ +public enum ExperimentalActiveRegionShardType { + LOCUSSHARD, // default/legacy type + READSHARD, + ACTIVEREGIONSHARD +} diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 8f2f2be70..645ba3f3f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -3,10 +3,12 @@ package org.broadinstitute.sting.gatk.traversals; import com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; +import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocSortedSet; import org.broadinstitute.sting.utils.activeregion.ActiveRegionReadState; +import org.broadinstitute.sting.utils.activeregion.ExperimentalActiveRegionShardType; import org.broadinstitute.sting.utils.interval.IntervalMergingRule; import org.broadinstitute.sting.utils.interval.IntervalUtils; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; @@ -478,6 +480,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List createDataProviders(List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); + GATKArgumentCollection arguments = new GATKArgumentCollection(); + arguments.activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; // make explicit + engine.setArguments(arguments); t.initialize(engine); Collection samFiles = new ArrayList(); From e7553545efe0101b4101383e9141afdbfe0fdf6e Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 3 Jan 2013 13:57:36 -0500 Subject: [PATCH 06/18] Initial updates for ReadShard --- .../sting/gatk/GenomeAnalysisEngine.java | 28 ++- .../providers/LocusShardDataProvider.java | 16 ++ ...imentalReadShardTraverseActiveRegions.java | 235 ++++++++++-------- .../utils/activeregion/ActiveRegion.java | 19 +- .../TraverseActiveRegionsUnitTest.java | 73 ++++-- 5 files changed, 242 insertions(+), 129 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index 1187039bb..f17450247 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -570,11 +570,29 @@ public class GenomeAnalysisEngine { else if(walker instanceof ActiveRegionWalker) { if (readsDataSource.getSortOrder() != SAMFileHeader.SortOrder.coordinate) throw new UserException.MissortedBAM(SAMFileHeader.SortOrder.coordinate, "Active region walkers can only traverse coordinate-sorted data. Please resort your input BAM file(s) or set the Sort Order tag in the header appropriately."); - if(intervals == null) - return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); - else - return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); - } + + switch(argCollection.activeRegionShardType) { + case LOCUSSHARD: + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new LocusShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new LocusShardBalancer()); + case READSHARD: + // Use the legacy ReadShardBalancer if legacy downsampling is enabled + ShardBalancer readShardBalancer = downsamplingMethod != null && downsamplingMethod.useLegacyDownsampler ? + new LegacyReadShardBalancer() : + new ReadShardBalancer(); + + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(), readShardBalancer); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), readShardBalancer); + case ACTIVEREGIONSHARD: + throw new UserException.CommandLineException("Not implemented."); + default: + throw new UserException.CommandLineException("Invalid active region shard type."); + } + } else if(walker instanceof ReadWalker || walker instanceof ReadPairWalker || walker instanceof DuplicateWalker) { // Apply special validation to read pair walkers. if(walker instanceof ReadPairWalker) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java index 4888b9f41..1607469eb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/LocusShardDataProvider.java @@ -44,6 +44,22 @@ public class LocusShardDataProvider extends ShardDataProvider { this.locusIterator = locusIterator; } + /** + * Create a data provider based on an input provider + * Used only by ExperimentalReadShardTraverseActiveRegions + * @param dataProvider + * @param sourceInfo + * @param genomeLocParser + * @param locus + * @param locusIterator + */ + public LocusShardDataProvider(ShardDataProvider dataProvider, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, GenomeLoc locus, LocusIterator locusIterator) { + super(dataProvider.getShard(),genomeLocParser,dataProvider.getReference(),dataProvider.getReferenceOrderedData()); + this.sourceInfo = sourceInfo; + this.locus = locus; + this.locusIterator = locusIterator; + } + /** * Returns information about the source of the reads. * @return Info about the source of the reads. diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java index 516a7dc34..672d37f7f 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -1,32 +1,35 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; -public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class ExperimentalReadShardTraverseActiveRegions extends TraversalEngine,ReadShardDataProvider> { /** * our log, which we want to capture anything from this class */ protected final static Logger logger = Logger.getLogger(TraversalEngine.class); private final LinkedList workQueue = new LinkedList(); - private final LinkedHashSet myReads = new LinkedHashSet(); + private final LinkedList myReads = new LinkedList(); @Override public String getTraversalUnits() { @@ -35,71 +38,65 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn @Override public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, + final ReadShardDataProvider readDataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseActiveRegion.traverse: Read Shard is %s", readDataProvider)); - final LocusView locusView = new AllLocusView(dataProvider); - - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - int minStart = Integer.MAX_VALUE; + final ReadView readView = new ReadView(readDataProvider); + final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + Shard readShard = readDataProvider.getShard(); + SAMFileHeader header = readShard.getReadProperties().getHeader(); + WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), + readView.iterator(), readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + LocusShardDataProvider locusDataProvider = new LocusShardDataProvider(readDataProvider, + iterator.getSourceInfo(), engine.getGenomeLocParser(), iterator.getLocus(), iterator); - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); + final LocusView locusView = new AllLocusView(locusDataProvider); + final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); + + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); + readDataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); + locusDataProvider.close(); } - updateCumulativeMetrics(dataProvider.getShard()); + windowMaker.close(); + + updateCumulativeMetrics(readDataProvider.getShard()); if ( ! profile.isEmpty() ) incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); @@ -113,7 +110,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); } } workQueue.addAll( activeRegions ); @@ -121,21 +118,13 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // now process the active regions, where possible + boolean emptyQueue = false; + sum = processActiveRegions(walker, sum, emptyQueue); return sum; } - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - private boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - /** * Take the individual isActive calls and integrate them into contiguous active regions and * add these blocks of work to the work queue @@ -191,12 +180,12 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn // // -------------------------------------------------------------------------------- - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); return sum; } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); } } @@ -214,70 +203,99 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn } } - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { + final int lastRegionStart = workQueue.getLast().getLocation().getStart(); + final String lastRegionContig = workQueue.getLast().getLocation().getContig(); + + // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion(activeRegion, sum, walker); - } else { - break; + while( workQueue.peekFirst() != null ) { + ActiveRegion firstRegion = workQueue.getFirst(); + final String firstRegionContig = firstRegion.getLocation().getContig(); + if (emptyQueue || firstRegionContig != lastRegionContig) { + sum = processFirstActiveRegion(sum, walker); + } + else { + final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); + if (lastRegionStart > firstRegionMaxReadStop) { + sum = processFirstActiveRegion( sum, walker ); + } + else { + break; + } } } return sum; } - private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { + /** + * Process the first active region and all remaining reads which overlap + * + * Remove the first active region from the queue + * (NB: some reads associated with this active region may have already been processed) + * + * Remove all of these reads from the queue + * (NB: some may be associated with other active regions) + * + * @param sum + * @param walker + * @return + */ + private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { + final ActiveRegion firstRegion = workQueue.removeFirst(); + + GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here + GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + + while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || + (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { + if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; + long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); + ActiveRegion bestRegion = firstRegion; for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); bestRegion = otherRegionToTest; } } - bestRegion.add( read ); + bestRegion.add( firstRead ); // The read is also added to all other regions in which it overlaps but marked as non-primary if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); + if( !bestRegion.equals(firstRegion) ) { + firstRegion.add(firstRead); } for( final ActiveRegion otherRegionToTest : workQueue ) { if( !bestRegion.equals(otherRegionToTest) ) { // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); + if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); } } } } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); + // check for non-primary vs. extended + } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + firstRegion.add( firstRead ); + } + } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { + firstRegion.add( firstRead ); + } + + myReads.removeFirst(); + firstRead = myReads.peekFirst(); + firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + } + + logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? "active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); + final M x = walker.map( firstRegion, null ); + return walker.reduce(x, sum); } /** @@ -285,6 +303,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalEn * Ugly for now but will be cleaned up when we push this functionality more into the engine */ public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + boolean emptyQueue = true; + return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); } } diff --git a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java index 3e3bb220a..d1199ad3d 100644 --- a/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java +++ b/public/java/src/org/broadinstitute/sting/utils/activeregion/ActiveRegion.java @@ -26,14 +26,19 @@ public class ActiveRegion implements HasGenomeLocation { private final GenomeLocParser genomeLocParser; public final boolean isActive; + // maximum stop position of all reads with start position in this active region + // Used only by ExperimentalReadShardTraverseActiveRegions + // NB: these reads may not be associated with this active region! + private int maxReadStop; + public ActiveRegion( final GenomeLoc activeRegionLoc, final boolean isActive, final GenomeLocParser genomeLocParser, final int extension ) { this.activeRegionLoc = activeRegionLoc; this.isActive = isActive; this.genomeLocParser = genomeLocParser; - this.extension = extension; extendedLoc = genomeLocParser.createGenomeLocOnContig(activeRegionLoc.getContig(), activeRegionLoc.getStart() - extension, activeRegionLoc.getStop() + extension); fullExtentReferenceLoc = extendedLoc; + maxReadStop = activeRegionLoc.getStart(); } @Override @@ -94,6 +99,18 @@ public class ActiveRegion implements HasGenomeLocation { public void remove( final GATKSAMRecord read ) { reads.remove( read ); } public void removeAll( final ArrayList readsToRemove ) { reads.removeAll( readsToRemove ); } + public void setMaxReadStop(int maxReadStop) { + this.maxReadStop = maxReadStop; + } + + public int getMaxReadStop() { + return maxReadStop; + } + + public int getExtendedMaxReadStop() { + return maxReadStop + extension; + } + public boolean equalExceptReads(final ActiveRegion other) { if ( activeRegionLoc.compareTo(other.activeRegionLoc) != 0 ) return false; if ( isActive != other.isActive ) return false; diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 645ba3f3f..1204e639e 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -4,6 +4,9 @@ import com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; +import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; import org.broadinstitute.sting.gatk.datasources.reads.*; import org.broadinstitute.sting.gatk.resourcemanagement.ThreadAllocation; import org.broadinstitute.sting.utils.GenomeLocSortedSet; @@ -17,7 +20,6 @@ import org.broadinstitute.sting.BaseTest; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; -import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -97,7 +99,9 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } } - private final TraverseActiveRegions t = new TraverseActiveRegions(); + private final TraverseActiveRegions traverse = new TraverseActiveRegions(); + private final ExperimentalReadShardTraverseActiveRegions readShardTraverse = new ExperimentalReadShardTraverseActiveRegions(); + private final ExperimentalActiveRegionShardTraverseActiveRegions activeRegionShardTraverse = new ExperimentalActiveRegionShardTraverseActiveRegions(); private IndexedFastaSequenceFile reference; private SAMSequenceDictionary dictionary; @@ -108,6 +112,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private static final String testBAM = "TraverseActiveRegionsUnitTest.bam"; private static final String testBAI = "TraverseActiveRegionsUnitTest.bai"; + private static final ExperimentalActiveRegionShardType shardType = ExperimentalActiveRegionShardType.LOCUSSHARD; + @BeforeClass private void init() throws FileNotFoundException { reference = new CachingIndexedFastaSequenceFile(new File(hg19Reference)); @@ -175,8 +181,8 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { private List getIsActiveIntervals(DummyActiveRegionWalker walker, List intervals) { List activeIntervals = new ArrayList(); - for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) { - t.traverse(walker, dataProvider, 0); + for (ShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) { + traverse(walker, dataProvider, 0); activeIntervals.addAll(walker.isActiveCalls); } @@ -413,10 +419,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { } private Map getActiveRegions(DummyActiveRegionWalker walker, List intervals) { - for (LocusShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) - t.traverse(walker, dataProvider, 0); + for (ShardDataProvider dataProvider : createDataProviders(intervals, testBAM)) + traverse(walker, dataProvider, 0); - t.endTraversal(walker, 0); + endTraversal(walker, 0); return walker.mappedActiveRegions; } @@ -477,13 +483,12 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { return record; } - private List createDataProviders(List intervals, String bamFile) { + private List createDataProviders(List intervals, String bamFile) { GenomeAnalysisEngine engine = new GenomeAnalysisEngine(); engine.setGenomeLocParser(genomeLocParser); GATKArgumentCollection arguments = new GATKArgumentCollection(); - arguments.activeRegionShardType = ExperimentalActiveRegionShardType.LOCUSSHARD; // make explicit + arguments.activeRegionShardType = shardType; // make explicit engine.setArguments(arguments); - t.initialize(engine); Collection samFiles = new ArrayList(); SAMReaderID readerID = new SAMReaderID(new File(bamFile), new Tags()); @@ -491,13 +496,51 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { SAMDataSource dataSource = new SAMDataSource(samFiles, new ThreadAllocation(), null, genomeLocParser); - List providers = new ArrayList(); - for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { - for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { - providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); - } + List providers = new ArrayList(); + + switch (shardType) { + case LOCUSSHARD: + traverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new LocusShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new LocusShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, window.getLocus(), window, reference, new ArrayList())); + } + } + break; + case READSHARD: + readShardTraverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ReadShardBalancer())) { + providers.add(new ReadShardDataProvider(shard, genomeLocParser, shard.iterator(), reference, new ArrayList())); + } + break; + default: // other types not implemented } return providers; } + + private void traverse(DummyActiveRegionWalker walker, ShardDataProvider dataProvider, int i) { + switch (shardType) { + case LOCUSSHARD: + traverse.traverse(walker, (LocusShardDataProvider) dataProvider, i); + break; + case READSHARD: + readShardTraverse.traverse(walker, (ReadShardDataProvider) dataProvider, i); + break; + default: // other types not implemented + } + } + + private void endTraversal(DummyActiveRegionWalker walker, int i) { + switch (shardType) { + case LOCUSSHARD: + traverse.endTraversal(walker, i); + break; + case READSHARD: + readShardTraverse.endTraversal(walker, i); + break; + default: // other types not implemented + } + } + } From 319d651e4a7b5198cd266ca337ebc9750e53fe9a Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Thu, 3 Jan 2013 15:49:40 -0500 Subject: [PATCH 07/18] Initial updates for ActiveRegionShard --- .../sting/gatk/GenomeAnalysisEngine.java | 5 +- .../ActiveRegionShardDataProvider.java | 58 +++++ .../datasources/reads/ActiveRegionShard.java | 41 +++ .../reads/ActiveRegionShardBalancer.java | 32 +++ .../sting/gatk/datasources/reads/Shard.java | 4 +- .../gatk/executive/LinearMicroScheduler.java | 13 + ...ctiveRegionShardTraverseActiveRegions.java | 233 ++++++++++-------- ...imentalReadShardTraverseActiveRegions.java | 2 +- .../traversals/TraverseActiveRegions.java | 2 +- .../TraverseActiveRegionsUnitTest.java | 22 +- 10 files changed, 298 insertions(+), 114 deletions(-) create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java create mode 100755 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java create mode 100644 public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java index f17450247..bee25dc2f 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisEngine.java @@ -588,7 +588,10 @@ public class GenomeAnalysisEngine { else return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), readShardBalancer); case ACTIVEREGIONSHARD: - throw new UserException.CommandLineException("Not implemented."); + if(intervals == null) + return readsDataSource.createShardIteratorOverMappedReads(referenceDataSource.getReference().getSequenceDictionary(),new ActiveRegionShardBalancer()); + else + return readsDataSource.createShardIteratorOverIntervals(((ActiveRegionWalker)walker).extendIntervals(intervals, this.genomeLocParser, this.getReferenceDataSource().getReference()), new ActiveRegionShardBalancer()); default: throw new UserException.CommandLineException("Invalid active region shard type."); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java new file mode 100644 index 000000000..55e51f934 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/providers/ActiveRegionShardDataProvider.java @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.providers; + +import net.sf.picard.reference.IndexedFastaSequenceFile; +import org.broadinstitute.sting.gatk.ReadProperties; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.datasources.rmd.ReferenceOrderedDataSource; +import org.broadinstitute.sting.gatk.iterators.LocusIterator; +import org.broadinstitute.sting.gatk.iterators.StingSAMIterator; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.Collection; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShardDataProvider extends ShardDataProvider { + final private ReadShardDataProvider readProvider; + final private LocusShardDataProvider locusProvider; + + public ActiveRegionShardDataProvider(Shard shard, ReadProperties sourceInfo, GenomeLocParser genomeLocParser, StingSAMIterator reads, GenomeLoc locus, LocusIterator locusIterator, IndexedFastaSequenceFile reference, Collection rods) { + super(shard, genomeLocParser, reference, rods); // TODO: necessary? + readProvider = new ReadShardDataProvider(shard, genomeLocParser, reads, reference, rods); + locusProvider = new LocusShardDataProvider(shard, sourceInfo, genomeLocParser, locus, locusIterator, reference, rods); + } + + public ReadShardDataProvider getReadShardDataProvider() { + return readProvider; + } + + public LocusShardDataProvider getLocusShardDataProvider(LocusIterator iterator) { + return locusProvider; + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java new file mode 100755 index 000000000..381b193e9 --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShard.java @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +import net.sf.samtools.SAMFileSpan; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.GenomeLocParser; + +import java.util.List; +import java.util.Map; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShard extends ReadShard { + public ActiveRegionShard(GenomeLocParser parser, SAMDataSource readsDataSource, Map fileSpans, List loci, boolean isUnmapped) { + super(parser, readsDataSource, fileSpans, loci, isUnmapped); + } +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java new file mode 100644 index 000000000..338dd1bdf --- /dev/null +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/ActiveRegionShardBalancer.java @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2012, The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.datasources.reads; + +/** + * @author Joel Thibault + */ +public class ActiveRegionShardBalancer extends ReadShardBalancer { + // TODO ? +} diff --git a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java index e22a7a54d..314156af6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java +++ b/public/java/src/org/broadinstitute/sting/gatk/datasources/reads/Shard.java @@ -40,7 +40,9 @@ import java.util.Map; */ public abstract class Shard implements HasGenomeLocation { public enum ShardType { - READ, LOCUS + READ, + LOCUS, + ACTIVEREGION // Used only by ExperimentalActiveRegionShardTraverseActiveRegions } protected final GenomeLocParser parser; // incredibly annoying! diff --git a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java index 84d975879..44f9978a6 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java +++ b/public/java/src/org/broadinstitute/sting/gatk/executive/LinearMicroScheduler.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.executive; import net.sf.picard.reference.IndexedFastaSequenceFile; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; +import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -80,6 +81,18 @@ public class LinearMicroScheduler extends MicroScheduler { } windowMaker.close(); } + else if(shard.getShardType() == Shard.ShardType.ACTIVEREGION) { + WindowMaker windowMaker = new WindowMaker(shard, engine.getGenomeLocParser(), + getReadIterator(shard), shard.getGenomeLocs(), SampleUtils.getSAMFileSamples(engine)); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + ShardDataProvider dataProvider = new ActiveRegionShardDataProvider(shard,iterator.getSourceInfo(),engine.getGenomeLocParser(),getReadIterator(shard),iterator.getLocus(),iterator,reference,rods); + Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); + accumulator.accumulate(dataProvider,result); + dataProvider.close(); + if ( walker.isDone() ) break; + } + windowMaker.close(); + } else { ShardDataProvider dataProvider = new ReadShardDataProvider(shard,engine.getGenomeLocParser(),getReadIterator(shard),reference,rods); Object result = traversalEngine.traverse(walker, dataProvider, accumulator.getReduceInit()); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java index 71cb89ad9..45d132678 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalActiveRegionShardTraverseActiveRegions.java @@ -1,32 +1,35 @@ package org.broadinstitute.sting.gatk.traversals; +import net.sf.samtools.SAMFileHeader; import org.apache.log4j.Logger; import org.broadinstitute.sting.gatk.WalkerManager; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.datasources.providers.*; +import org.broadinstitute.sting.gatk.datasources.reads.Shard; +import org.broadinstitute.sting.gatk.executive.WindowMaker; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.ActiveRegionExtension; import org.broadinstitute.sting.gatk.walkers.ActiveRegionWalker; import org.broadinstitute.sting.gatk.walkers.DataSource; import org.broadinstitute.sting.gatk.walkers.Walker; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.SampleUtils; import org.broadinstitute.sting.utils.activeregion.ActiveRegion; import org.broadinstitute.sting.utils.activeregion.ActivityProfile; import org.broadinstitute.sting.utils.activeregion.ActivityProfileResult; -import org.broadinstitute.sting.utils.pileup.PileupElement; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.util.*; -public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,LocusShardDataProvider> { +public class ExperimentalActiveRegionShardTraverseActiveRegions extends TraversalEngine,ActiveRegionShardDataProvider> { /** * our log, which we want to capture anything from this class */ protected final static Logger logger = Logger.getLogger(TraversalEngine.class); private final LinkedList workQueue = new LinkedList(); - private final LinkedHashSet myReads = new LinkedHashSet(); + private final LinkedList myReads = new LinkedList(); @Override public String getTraversalUnits() { @@ -35,71 +38,65 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra @Override public T traverse( final ActiveRegionWalker walker, - final LocusShardDataProvider dataProvider, + final ActiveRegionShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("ExperimentalActiveRegionShardTraverseActiveRegions.traverse: Shard is %s", dataProvider)); - final LocusView locusView = new AllLocusView(dataProvider); + ReadShardDataProvider readDataProvider = dataProvider.getReadShardDataProvider(); - final LocusReferenceView referenceView = new LocusReferenceView( walker, dataProvider ); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); - int minStart = Integer.MAX_VALUE; + final ReadView readView = new ReadView(readDataProvider); + final List activeRegions = new LinkedList(); - ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions() ); + ActivityProfile profile = new ActivityProfile(engine.getGenomeLocParser(), walker.hasPresetActiveRegions()); - ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, dataProvider, locusView); + Shard readShard = readDataProvider.getShard(); + SAMFileHeader header = readShard.getReadProperties().getHeader(); + WindowMaker windowMaker = new WindowMaker(readShard, engine.getGenomeLocParser(), + readView.iterator(), readShard.getGenomeLocs(), SampleUtils.getSAMFileSamples(header)); - // We keep processing while the next reference location is within the interval - GenomeLoc prevLoc = null; - while( locusView.hasNext() ) { - final AlignmentContext locus = locusView.next(); - final GenomeLoc location = locus.getLocation(); + for(WindowMaker.WindowMakerIterator iterator: windowMaker) { + LocusShardDataProvider locusDataProvider = dataProvider.getLocusShardDataProvider(iterator); + final LocusView locusView = new AllLocusView(locusDataProvider); + final LocusReferenceView referenceView = new LocusReferenceView( walker, locusDataProvider ); + ReferenceOrderedView referenceOrderedDataView = getReferenceOrderedView(walker, locusDataProvider, locusView); - // Grab all the previously unseen reads from this pileup and add them to the massive read list - // Note that this must occur before we leave because we are outside the intervals because - // reads may occur outside our intervals but overlap them in the future - // TODO -- this whole HashSet logic should be changed to a linked list of reads with - // TODO -- subsequent pass over them to find the ones overlapping the active regions - for( final PileupElement p : locus.getBasePileup() ) { - final GATKSAMRecord read = p.getRead(); - if( !myReads.contains(read) ) { - myReads.add(read); + // We keep processing while the next reference location is within the interval + GenomeLoc prevLoc = null; + while( locusView.hasNext() ) { + final AlignmentContext locus = locusView.next(); + final GenomeLoc location = locus.getLocation(); + + if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { + // we've move across some interval boundary, restart profile + profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); } - // If this is the last pileup for this shard calculate the minimum alignment start so that we know - // which active regions in the work queue are now safe to process - minStart = Math.min(minStart, read.getAlignmentStart()); + readDataProvider.getShard().getReadMetrics().incrementNumIterations(); + + // create reference context. Note that if we have a pileup of "extended events", the context will + // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). + final ReferenceContext refContext = referenceView.getReferenceContext(location); + + // Iterate forward to get all reference ordered data covering this location + final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); + + // Call the walkers isActive function for this locus and add them to the list to be integrated later + profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); + + prevLoc = location; + + printProgress(locus.getLocation()); } - // skip this location -- it's not part of our engine intervals - if ( outsideEngineIntervals(location) ) - continue; - - if ( prevLoc != null && location.getStart() != prevLoc.getStop() + 1 ) { - // we've move across some interval boundary, restart profile - profile = incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); - } - - dataProvider.getShard().getReadMetrics().incrementNumIterations(); - - // create reference context. Note that if we have a pileup of "extended events", the context will - // hold the (longest) stretch of deleted reference bases (if deletions are present in the pileup). - final ReferenceContext refContext = referenceView.getReferenceContext(location); - - // Iterate forward to get all reference ordered data covering this location - final RefMetaDataTracker tracker = referenceOrderedDataView.getReferenceOrderedDataAtLocus(locus.getLocation(), refContext); - - // Call the walkers isActive function for this locus and add them to the list to be integrated later - profile.add(walkerActiveProb(walker, tracker, refContext, locus, location)); - - prevLoc = location; - - printProgress(locus.getLocation()); + locusDataProvider.close(); } - updateCumulativeMetrics(dataProvider.getShard()); + windowMaker.close(); + + updateCumulativeMetrics(readDataProvider.getShard()); if ( ! profile.isEmpty() ) incorporateActiveRegions(profile, activeRegions, activeRegionExtension, maxRegionSize); @@ -113,7 +110,7 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra if( last.isActive == first.isActive && last.getLocation().contiguousP(first.getLocation()) && last.getLocation().size() + first.getLocation().size() <= maxRegionSize ) { workQueue.removeLast(); activeRegions.remove(first); - workQueue.add( new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension) ); + workQueue.addLast(new ActiveRegion(last.getLocation().union(first.getLocation()), first.isActive, this.engine.getGenomeLocParser(), activeRegionExtension)); } } workQueue.addAll( activeRegions ); @@ -121,21 +118,13 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra logger.debug("Integrated " + profile.size() + " isActive calls into " + activeRegions.size() + " regions." ); - // now go and process all of the active regions - sum = processActiveRegions(walker, sum, minStart, dataProvider.getLocus().getContig()); + // now process the active regions, where possible + boolean emptyQueue = false; + sum = processActiveRegions(walker, sum, emptyQueue); return sum; } - /** - * Is the loc outside of the intervals being requested for processing by the GATK? - * @param loc - * @return - */ - private boolean outsideEngineIntervals(final GenomeLoc loc) { - return engine.getIntervals() != null && ! engine.getIntervals().overlaps(loc); - } - /** * Take the individual isActive calls and integrate them into contiguous active regions and * add these blocks of work to the work queue @@ -191,12 +180,12 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra // // -------------------------------------------------------------------------------- - private T processActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { + private T processActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { if( walker.activeRegionOutStream != null ) { writeActiveRegionsToStream(walker); return sum; } else { - return callWalkerMapOnActiveRegions(walker, sum, minStart, currentContig); + return callWalkerMapOnActiveRegions(walker, sum, emptyQueue); } } @@ -214,70 +203,99 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra } } - private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, final int minStart, final String currentContig ) { - // Since we've traversed sufficiently past this point (or this contig!) in the workQueue we can unload those regions and process them + private T callWalkerMapOnActiveRegions( final ActiveRegionWalker walker, T sum, boolean emptyQueue ) { + final int lastRegionStart = workQueue.getLast().getLocation().getStart(); + final String lastRegionContig = workQueue.getLast().getLocation().getContig(); + + // If we've traversed sufficiently past the beginning of the workQueue we can unload those regions and process them // TODO can implement parallel traversal here - while( workQueue.peek() != null ) { - final GenomeLoc extendedLoc = workQueue.peek().getExtendedLoc(); - if ( extendedLoc.getStop() < minStart || (currentContig != null && !workQueue.peek().getExtendedLoc().getContig().equals(currentContig))) { - final ActiveRegion activeRegion = workQueue.remove(); - sum = processActiveRegion(activeRegion, sum, walker); - } else { - break; + while( workQueue.peekFirst() != null ) { + ActiveRegion firstRegion = workQueue.getFirst(); + final String firstRegionContig = firstRegion.getLocation().getContig(); + if (emptyQueue || firstRegionContig != lastRegionContig) { + sum = processFirstActiveRegion(sum, walker); + } + else { + final int firstRegionMaxReadStop = walker.wantsExtendedReads() ? firstRegion.getMaxReadStop() : firstRegion.getExtendedMaxReadStop(); + if (lastRegionStart > firstRegionMaxReadStop) { + sum = processFirstActiveRegion( sum, walker ); + } + else { + break; + } } } return sum; } - private T processActiveRegion( final ActiveRegion activeRegion, final T sum, final ActiveRegionWalker walker ) { - final ArrayList placedReads = new ArrayList(); - for( final GATKSAMRecord read : myReads ) { - final GenomeLoc readLoc = this.engine.getGenomeLocParser().createGenomeLoc( read ); - if( activeRegion.getLocation().overlapsP( readLoc ) ) { + /** + * Process the first active region and all remaining reads which overlap + * + * Remove the first active region from the queue + * (NB: some reads associated with this active region may have already been processed) + * + * Remove all of these reads from the queue + * (NB: some may be associated with other active regions) + * + * @param sum + * @param walker + * @return + */ + private T processFirstActiveRegion( final T sum, final ActiveRegionWalker walker ) { + final ActiveRegion firstRegion = workQueue.removeFirst(); + + GATKSAMRecord firstRead = myReads.peekFirst(); // don't remove because it may not be placed here + GenomeLoc firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + + while ( firstRegion.getLocation().overlapsP( firstReadLoc ) || + (walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc ))) { + if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { // The region which the highest amount of overlap is chosen as the primary region for the read (tie breaking is done as right most region) - long maxOverlap = activeRegion.getLocation().sizeOfOverlap( readLoc ); - ActiveRegion bestRegion = activeRegion; + long maxOverlap = firstRegion.getLocation().sizeOfOverlap( firstReadLoc ); + ActiveRegion bestRegion = firstRegion; for( final ActiveRegion otherRegionToTest : workQueue ) { - if( otherRegionToTest.getLocation().sizeOfOverlap(readLoc) >= maxOverlap ) { - maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( readLoc ); + if( otherRegionToTest.getLocation().sizeOfOverlap(firstReadLoc) >= maxOverlap ) { + maxOverlap = otherRegionToTest.getLocation().sizeOfOverlap( firstReadLoc ); bestRegion = otherRegionToTest; } } - bestRegion.add( read ); + bestRegion.add( firstRead ); // The read is also added to all other regions in which it overlaps but marked as non-primary if( walker.wantsNonPrimaryReads() ) { - if( !bestRegion.equals(activeRegion) ) { - activeRegion.add( read ); + if( !bestRegion.equals(firstRegion) ) { + firstRegion.add(firstRead); } for( final ActiveRegion otherRegionToTest : workQueue ) { if( !bestRegion.equals(otherRegionToTest) ) { // check for non-primary vs. extended - if ( otherRegionToTest.getLocation().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); - } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( readLoc ) ) { - otherRegionToTest.add( read ); + if ( otherRegionToTest.getLocation().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); + } else if ( walker.wantsExtendedReads() && otherRegionToTest.getExtendedLoc().overlapsP( firstReadLoc ) ) { + otherRegionToTest.add( firstRead ); } } } } - placedReads.add( read ); - // check for non-primary vs. extended - } else if( activeRegion.getLocation().overlapsP( readLoc ) ) { - if ( walker.wantsNonPrimaryReads() ) { - activeRegion.add( read ); - } - } else if( walker.wantsExtendedReads() && activeRegion.getExtendedLoc().overlapsP( readLoc )) { - activeRegion.add( read ); - } - } - myReads.removeAll( placedReads ); // remove all the reads which have been placed into their active region - // WARNING: This hashset relies on reads being exactly equal when they are placed in the list as when they are removed. So the ActiveRegionWalker can't modify the reads in any way. - logger.debug(">> Map call with " + activeRegion.getReads().size() + " " + (activeRegion.isActive ? "active" : "inactive") + " reads @ " + activeRegion.getLocation() + " with full extent: " + activeRegion.getReferenceLoc()); - final M x = walker.map(activeRegion, null); - return walker.reduce( x, sum ); + // check for non-primary vs. extended + } else if( firstRegion.getLocation().overlapsP( firstReadLoc ) ) { + if ( walker.wantsNonPrimaryReads() ) { + firstRegion.add( firstRead ); + } + } else if( walker.wantsExtendedReads() && firstRegion.getExtendedLoc().overlapsP( firstReadLoc )) { + firstRegion.add( firstRead ); + } + + myReads.removeFirst(); + firstRead = myReads.peekFirst(); + firstReadLoc = this.engine.getGenomeLocParser().createGenomeLoc( firstRead ); + } + + logger.debug(">> Map call with " + firstRegion.getReads().size() + " " + (firstRegion.isActive ? "active" : "inactive") + " reads @ " + firstRegion.getLocation() + " with full extent: " + firstRegion.getReferenceLoc()); + final M x = walker.map( firstRegion, null ); + return walker.reduce(x, sum); } /** @@ -285,6 +303,7 @@ public class ExperimentalActiveRegionShardTraverseActiveRegions extends Tra * Ugly for now but will be cleaned up when we push this functionality more into the engine */ public T endTraversal( final Walker walker, T sum) { - return processActiveRegions((ActiveRegionWalker)walker, sum, Integer.MAX_VALUE, null); + boolean emptyQueue = true; + return processActiveRegions((ActiveRegionWalker)walker, sum, emptyQueue); } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java index 672d37f7f..299ee4f56 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/ExperimentalReadShardTraverseActiveRegions.java @@ -40,7 +40,7 @@ public class ExperimentalReadShardTraverseActiveRegions extends TraversalE public T traverse( final ActiveRegionWalker walker, final ReadShardDataProvider readDataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Read Shard is %s", readDataProvider)); + logger.debug(String.format("ExperimentalReadShardTraverseActiveRegions.traverse: Read Shard is %s", readDataProvider)); final int activeRegionExtension = walker.getClass().getAnnotation(ActiveRegionExtension.class).extension(); final int maxRegionSize = walker.getClass().getAnnotation(ActiveRegionExtension.class).maxRegion(); diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java index 33323ba67..2562fcccd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegions.java @@ -43,7 +43,7 @@ public class TraverseActiveRegions extends TraversalEngine walker, final LocusShardDataProvider dataProvider, T sum) { - logger.debug(String.format("TraverseActiveRegion.traverse: Shard is %s", dataProvider)); + logger.debug(String.format("TraverseActiveRegions.traverse: Shard is %s", dataProvider)); final LocusView locusView = new AllLocusView(dataProvider); diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 1204e639e..5051bc35f 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -4,6 +4,7 @@ import com.google.java.contract.PreconditionError; import net.sf.samtools.*; import org.broadinstitute.sting.commandline.Tags; import org.broadinstitute.sting.gatk.arguments.GATKArgumentCollection; +import org.broadinstitute.sting.gatk.datasources.providers.ActiveRegionShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.LocusShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ReadShardDataProvider; import org.broadinstitute.sting.gatk.datasources.providers.ShardDataProvider; @@ -32,6 +33,7 @@ import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; import org.broadinstitute.sting.utils.sam.ReadUtils; import org.testng.Assert; +import org.testng.TestException; import org.testng.annotations.BeforeClass; import org.testng.annotations.Test; @@ -513,7 +515,15 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { providers.add(new ReadShardDataProvider(shard, genomeLocParser, shard.iterator(), reference, new ArrayList())); } break; - default: // other types not implemented + case ACTIVEREGIONSHARD: + activeRegionShardTraverse.initialize(engine); + for (Shard shard : dataSource.createShardIteratorOverIntervals(new GenomeLocSortedSet(genomeLocParser, intervals), new ActiveRegionShardBalancer())) { + for (WindowMaker.WindowMakerIterator window : new WindowMaker(shard, genomeLocParser, dataSource.seek(shard), shard.getGenomeLocs())) { + providers.add(new ActiveRegionShardDataProvider(shard, shard.getReadProperties(), genomeLocParser, shard.iterator(), window.getLocus(), window, reference, new ArrayList())); + } + } + break; + default: throw new TestException("Invalid shard type"); } return providers; @@ -527,7 +537,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { case READSHARD: readShardTraverse.traverse(walker, (ReadShardDataProvider) dataProvider, i); break; - default: // other types not implemented + case ACTIVEREGIONSHARD: + activeRegionShardTraverse.traverse(walker, (ActiveRegionShardDataProvider) dataProvider, i); + break; + default: throw new TestException("Invalid shard type"); } } @@ -539,7 +552,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { case READSHARD: readShardTraverse.endTraversal(walker, i); break; - default: // other types not implemented + case ACTIVEREGIONSHARD: + activeRegionShardTraverse.endTraversal(walker, i); + break; + default: throw new TestException("Invalid shard type"); } } From fbee4c11f1bca8e530e892ae02b29dfd7e978367 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 15:20:25 -0500 Subject: [PATCH 09/18] Unit tests for ProgressMeterData --- .../ProgressMeterDataUnitTest.java | 86 +++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java new file mode 100644 index 000000000..d6ea3b227 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDataUnitTest.java @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.AutoFormattingTime; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.TimeUnit; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +/** + * UnitTests for the ProgressMeterData + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProgressMeterDataUnitTest extends BaseTest { + @Test + public void testBasic() { + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getElapsedSeconds(), 1.0); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getUnitsProcessed(), 2); + Assert.assertEquals(new ProgressMeterData(1.0, 2, 3).getBpProcessed(), 3); + } + + @Test + public void testFraction() { + final double TOL = 1e-3; + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(10), 0.1, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(10), 0.2, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(100), 0.01, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 2).calculateFractionGenomeTargetCompleted(100), 0.02, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 1).calculateFractionGenomeTargetCompleted(0), 1.0, TOL); + } + + @Test + public void testSecondsPerBP() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, 1, M).secondsPerMillionBP(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, M/10).secondsPerMillionBP(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.0, 1, M).secondsPerMillionBP(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 1, 0).secondsPerMillionBP(), 1e6, TOL); + } + + @Test + public void testSecondsPerElement() { + final double TOL = 1e-3; + final long M = 1000000; + Assert.assertEquals(new ProgressMeterData(1.0, M, 1).secondsPerMillionElements(), 1.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, M/10, 1).secondsPerMillionElements(), 10.0, TOL); + Assert.assertEquals(new ProgressMeterData(2.00, M, 1).secondsPerMillionElements(), 2.0, TOL); + Assert.assertEquals(new ProgressMeterData(1.0, 0, 1).secondsPerMillionElements(), 1e6, TOL); + } +} From 1ba8d47a81c94561feabef7740e2638b670a88ae Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Wed, 2 Jan 2013 16:12:08 -0500 Subject: [PATCH 10/18] Unit tests for ProgressMeterDaemon --- .../utils/progressmeter/ProgressMeter.java | 13 ++- .../progressmeter/ProgressMeterDaemon.java | 31 +++++- .../ProgressMeterDaemonUnitTest.java | 102 ++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index 161335957..c9d849227 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -160,6 +160,13 @@ public class ProgressMeter { public ProgressMeter(final File performanceLogFile, final String processingUnitName, final GenomeLocSortedSet processingIntervals) { + this(performanceLogFile, processingUnitName, processingIntervals, ProgressMeterDaemon.DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + + protected ProgressMeter(final File performanceLogFile, + final String processingUnitName, + final GenomeLocSortedSet processingIntervals, + final long pollingFrequency) { if ( processingUnitName == null ) throw new IllegalArgumentException("processingUnitName cannot be null"); if ( processingIntervals == null ) throw new IllegalArgumentException("Target intervals cannot be null"); @@ -184,10 +191,14 @@ public class ProgressMeter { targetSizeInBP = processingIntervals.coveredSize(); // start up the timer - progressMeterDaemon = new ProgressMeterDaemon(this); + progressMeterDaemon = new ProgressMeterDaemon(this, pollingFrequency); start(); } + public ProgressMeterDaemon getProgressMeterDaemon() { + return progressMeterDaemon; + } + /** * Start up the progress meter, printing initialization message and starting up the * daemon thread for periodic printing. diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java index 16887400a..7edd8e724 100644 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemon.java @@ -8,10 +8,20 @@ package org.broadinstitute.sting.utils.progressmeter; * Time: 9:16 PM */ public final class ProgressMeterDaemon extends Thread { + public final static long DEFAULT_POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + /** * How frequently should we poll and print progress? */ - private final static long POLL_FREQUENCY_MILLISECONDS = 10 * 1000; + private final long pollFrequencyMilliseconds; + + /** + * How long are we waiting between print progress calls are issued? + * @return the time in milliseconds between progress meter calls + */ + private long getPollFrequencyMilliseconds() { + return pollFrequencyMilliseconds; + } /** * Are we to continue periodically printing status, or should we shut down? @@ -27,13 +37,20 @@ public final class ProgressMeterDaemon extends Thread { * Create a new ProgressMeterDaemon printing progress for meter * @param meter the progress meter to print progress of */ - public ProgressMeterDaemon(final ProgressMeter meter) { + public ProgressMeterDaemon(final ProgressMeter meter, final long pollFrequencyMilliseconds) { if ( meter == null ) throw new IllegalArgumentException("meter cannot be null"); + if ( pollFrequencyMilliseconds <= 0 ) throw new IllegalArgumentException("pollFrequencyMilliseconds must be greater than 0 but got " + pollFrequencyMilliseconds); + this.meter = meter; + this.pollFrequencyMilliseconds = pollFrequencyMilliseconds; setDaemon(true); setName("ProgressMeterDaemon"); } + public ProgressMeterDaemon(final ProgressMeter meter) { + this(meter, DEFAULT_POLL_FREQUENCY_MILLISECONDS); + } + /** * Tells this daemon thread to shutdown at the next opportunity, as the progress * metering is complete. @@ -42,6 +59,14 @@ public final class ProgressMeterDaemon extends Thread { this.done = true; } + /** + * Is this daemon thread done? + * @return true if done, false otherwise + */ + public boolean isDone() { + return done; + } + /** * Start up the ProgressMeterDaemon, polling every tens of seconds to print, if * necessary, the provided progress meter. Never exits until the JVM is complete, @@ -51,7 +76,7 @@ public final class ProgressMeterDaemon extends Thread { while (! done) { meter.printProgress(false); try { - Thread.sleep(POLL_FREQUENCY_MILLISECONDS); + Thread.sleep(getPollFrequencyMilliseconds()); } catch (InterruptedException e) { throw new RuntimeException(e); } diff --git a/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java new file mode 100644 index 000000000..420db683e --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/progressmeter/ProgressMeterDaemonUnitTest.java @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.progressmeter; + +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.GenomeLocSortedSet; +import org.broadinstitute.sting.utils.fasta.CachingIndexedFastaSequenceFile; +import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +/** + * UnitTests for the ProgressMeterDaemon + * + * User: depristo + * Date: 8/24/12 + * Time: 11:25 AM + * To change this template use File | Settings | File Templates. + */ +public class ProgressMeterDaemonUnitTest extends BaseTest { + private GenomeLocParser genomeLocParser; + + @BeforeClass + public void init() throws FileNotFoundException { + genomeLocParser = new GenomeLocParser(new CachingIndexedFastaSequenceFile(new File(b37KGReference))); + } + + // capture and count calls to progress + private class TestingProgressMeter extends ProgressMeter { + final List progressCalls = new LinkedList(); + + private TestingProgressMeter(final long poll) { + super(null, "test", new GenomeLocSortedSet(genomeLocParser), poll); + } + + @Override + protected synchronized void printProgress(boolean mustPrint) { + progressCalls.add(System.currentTimeMillis()); + } + } + + @DataProvider(name = "PollingData") + public Object[][] makePollingData() { + List tests = new ArrayList(); + for ( final int ticks : Arrays.asList(1, 5, 10) ) { + for ( final int poll : Arrays.asList(10, 100) ) { + tests.add(new Object[]{poll, ticks}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "PollingData", invocationCount = 10, successPercentage = 90) + public void testMe(final long poll, final int ticks) throws InterruptedException { + final TestingProgressMeter meter = new TestingProgressMeter(poll); + final ProgressMeterDaemon daemon = meter.getProgressMeterDaemon(); + Assert.assertTrue(daemon.isDaemon()); + + Assert.assertFalse(daemon.isDone()); + Thread.sleep(ticks * poll); + Assert.assertFalse(daemon.isDone()); + + daemon.done(); + Assert.assertTrue(daemon.isDone()); + + Assert.assertEquals(meter.progressCalls.size(), ticks, + "Expected " + ticks + " progress calls from daemon thread, but only got " + meter.progressCalls.size() + " with exact calls " + meter.progressCalls); + } +} From 7df47418d839182cabd90839cea94da400d1d8eb Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 14:43:36 -0500 Subject: [PATCH 11/18] BQSR optimization: make RecalibrationTables thread-local, and merge results in onTraversalDone -- With the newer, faster BQSR, scaling was limited by the NestedIntegerArray. The solution to this is to make the entire table thread-local, so that each nct thread has its own data and doesn't have any collisions. -- Removed the previous partial solution of having a thread-local quality score table -- Added a new argument -lowMemory --- .../bqsr/AdvancedRecalibrationEngine.java | 42 +------- .../gatk/walkers/bqsr/BaseRecalibrator.java | 31 +++--- .../walkers/bqsr/RecalibrationEngine.java | 11 ++- .../bqsr/StandardRecalibrationEngine.java | 99 ++++++++++++++++--- .../recalibration/RecalibrationTables.java | 16 +-- 5 files changed, 130 insertions(+), 69 deletions(-) diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java index 255f1fd05..3871101eb 100644 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java @@ -25,35 +25,21 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ -import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; import org.broadinstitute.sting.utils.recalibration.EventType; import org.broadinstitute.sting.utils.recalibration.ReadCovariates; import org.broadinstitute.sting.utils.recalibration.RecalDatum; +import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; -import java.util.LinkedList; -import java.util.List; - public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - private final static Logger logger = Logger.getLogger(AdvancedRecalibrationEngine.class); - - final List> allThreadLocalQualityScoreTables = new LinkedList>(); - private ThreadLocal> threadLocalQualityScoreTables = new ThreadLocal>() { - @Override - protected synchronized NestedIntegerArray initialValue() { - final NestedIntegerArray table = recalibrationTables.makeQualityScoreTable(); - allThreadLocalQualityScoreTables.add(table); - return table; - } - }; - @Override public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { final GATKSAMRecord read = recalInfo.getRead(); final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final NestedIntegerArray qualityScoreTable = getThreadLocalQualityScoreTable(); + final RecalibrationTables tables = getRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); for( int offset = 0; offset < read.getReadBases().length; offset++ ) { if( ! recalInfo.skip(offset) ) { @@ -70,30 +56,10 @@ public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine imp if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); } } } } } - - /** - * Get a NestedIntegerArray for a QualityScore table specific to this thread - * @return a non-null NestedIntegerArray ready to be used to collect calibration info for the quality score covariate - */ - private NestedIntegerArray getThreadLocalQualityScoreTable() { - return threadLocalQualityScoreTables.get(); - } - - @Override - public void finalizeData() { - // merge in all of the thread local tables - logger.info("Merging " + allThreadLocalQualityScoreTables.size() + " thread-local quality score tables"); - for ( final NestedIntegerArray localTable : allThreadLocalQualityScoreTables ) { - recalibrationTables.combineQualityScoreTable(localTable); - } - allThreadLocalQualityScoreTables.clear(); // cleanup after ourselves - - super.finalizeData(); - } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index 7692c58e2..ffcfd6233 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -120,9 +120,16 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) public double BAQGOP = BAQ.DEFAULT_GOP; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + /** + * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency + * purposes. If you have many covariates, and therefore are using a lot of memory, you can use this flag + * to safely access only one table. There may be some CPU cost, but as long as the table is really big + * there should be relatively little CPU costs. + */ + @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) + public boolean lowMemoryMode = false; - private RecalibrationTables recalibrationTables; + private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) @@ -130,8 +137,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche private int minimumQToUse; - protected static final String COVARS_ATTRIBUTE = "COVARS"; // used to store covariates array as a temporary attribute inside GATKSAMRecord.\ - private static final String NO_DBSNP_EXCEPTION = "This calculation is critically dependent on being able to skip over known variant sites. Please provide a VCF file containing known sites of genetic variation."; private BAQ baq; // BAQ the reads on the fly to generate the alignment uncertainty vector @@ -143,7 +148,6 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * Based on the covariates' estimates for initial capacity allocate the data hashmap */ public void initialize() { - baq = new BAQ(BAQGOP); // setup the BAQ object with the provided gap open penalty // check for unsupported access @@ -188,10 +192,11 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche int numReadGroups = 0; for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) numReadGroups += header.getReadGroups().size(); - recalibrationTables = new RecalibrationTables(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, recalibrationTables); + recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); + if ( lowMemoryMode ) + recalibrationEngine.enableLowMemoryMode(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); @@ -501,14 +506,18 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche logger.info("Processed: " + result + " reads"); } + private RecalibrationTables getRecalibrationTable() { + return recalibrationEngine.getFinalRecalibrationTables(); + } + private void generatePlots() { File recalFile = getToolkit().getArguments().BQSR_RECAL_FILE; if (recalFile != null) { RecalibrationReport report = new RecalibrationReport(recalFile); - RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, report.getRecalibrationTables(), getRecalibrationTable(), requestedCovariates); } else - RecalUtils.generateRecalibrationPlot(RAC, recalibrationTables, requestedCovariates); + RecalUtils.generateRecalibrationPlot(RAC, getRecalibrationTable(), requestedCovariates); } /** @@ -517,10 +526,10 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche * generate a quantization map (recalibrated_qual -> quantized_qual) */ private void quantizeQualityScores() { - quantizationInfo = new QuantizationInfo(recalibrationTables, RAC.QUANTIZING_LEVELS); + quantizationInfo = new QuantizationInfo(getRecalibrationTable(), RAC.QUANTIZING_LEVELS); } private void generateReport() { - RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, recalibrationTables, requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); + RecalUtils.outputRecalibrationReport(RAC, quantizationInfo, getRecalibrationTable(), requestedCovariates, RAC.SORT_BY_ALL_COLUMNS); } } \ No newline at end of file diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 5c002b7e5..6c3189be5 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -5,6 +5,8 @@ import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; + /* * Copyright (c) 2009 The Broad Institute * @@ -40,9 +42,10 @@ public interface RecalibrationEngine { * The engine should collect match and mismatch data into the recalibrationTables data. * * @param covariates an array of the covariates we'll be using in this engine, order matters - * @param recalibrationTables the destination recalibrationTables where stats should be collected + * @param numReadGroups the number of read groups we should use for the recalibration tables + * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables); + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); /** * Update the recalibration statistics using the information in recalInfo @@ -57,4 +60,8 @@ public interface RecalibrationEngine { * Called once after all calls to updateDataForRead have been issued. */ public void finalizeData(); + + public void enableLowMemoryMode(); + + public RecalibrationTables getFinalRecalibrationTables(); } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java index a6ab98e8b..0cd042eeb 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java @@ -25,26 +25,64 @@ package org.broadinstitute.sting.gatk.walkers.bqsr; * OTHER DEALINGS IN THE SOFTWARE. */ +import com.google.java.contract.Requires; +import org.apache.log4j.Logger; import org.broadinstitute.sting.utils.classloader.PublicPackageSource; import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; +import org.broadinstitute.sting.utils.recalibration.*; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { + private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); protected Covariate[] covariates; - protected RecalibrationTables recalibrationTables; + private int numReadGroups; + private PrintStream maybeLogStream; + private boolean lowMemoryMode = false; + + private boolean finalized = false; + private RecalibrationTables mergedRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + logger.info("Creating RecalibrationTable for " + Thread.currentThread()); + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + protected RecalibrationTables getRecalibrationTables() { + return threadLocalTables.get(); + } + + public void enableLowMemoryMode() { + this.lowMemoryMode = true; + } @Override - public void initialize(final Covariate[] covariates, final RecalibrationTables recalibrationTables) { + public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( recalibrationTables == null ) throw new IllegalArgumentException("recalibrationTables cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); this.covariates = covariates.clone(); - this.recalibrationTables = recalibrationTables; + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; } @Override @@ -59,13 +97,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP final double isError = recalInfo.getErrorFraction(eventType, offset); final int[] keys = readCovariates.getKeySet(offset, eventType); - incrementDatumOrPutIfNecessary(recalibrationTables.getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); for (int i = 2; i < covariates.length; i++) { if (keys[i] < 0) continue; - incrementDatumOrPutIfNecessary(recalibrationTables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); + incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); } } } @@ -90,8 +128,13 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP */ @Override public void finalizeData() { - final NestedIntegerArray byReadGroupTable = recalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = recalibrationTables.getQualityScoreTable(); + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); + + // merge all of the thread-local tables + mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); + + final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); // iterate over all values in the qual table for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { @@ -108,6 +151,38 @@ public class StandardRecalibrationEngine implements RecalibrationEngine, PublicP rgDatum.combine(qualDatum); } } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return mergedRecalibrationTables; } /** diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java index 3f968d7f6..a6b1e13b9 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationTables.java @@ -123,12 +123,16 @@ public final class RecalibrationTables { } /** - * Merge in the quality score table information from qualityScoreTable into this - * recalibration table's quality score table. - * - * @param qualityScoreTable the quality score table we want to merge in + * Merge all of the tables from toMerge into into this set of tables */ - public void combineQualityScoreTable(final NestedIntegerArray qualityScoreTable) { - RecalUtils.combineTables(getQualityScoreTable(), qualityScoreTable); + public void combine(final RecalibrationTables toMerge) { + if ( numTables() != toMerge.numTables() ) + throw new IllegalArgumentException("Attempting to merge RecalibrationTables with different sizes"); + + for ( int i = 0; i < numTables(); i++ ) { + final NestedIntegerArray myTable = this.getTable(i); + final NestedIntegerArray otherTable = toMerge.getTable(i); + RecalUtils.combineTables(myTable, otherTable); + } } } From bbdf9ee91b895d353260f2b5219d1b3d6c3c3ce4 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Thu, 3 Jan 2013 16:47:45 -0500 Subject: [PATCH 12/18] BQSR cleanup: merge Advanced and Standard recalibration engine into just the RecalibrationEngine -- As we are no longer maintaining a public/protected system we need only have one RecalibrationEngine. -- Misc. code cleanup and docs along the way --- .../bqsr/AdvancedRecalibrationEngine.java | 65 ----- .../gatk/walkers/bqsr/BaseRecalibrator.java | 51 ++-- .../walkers/bqsr/RecalibrationEngine.java | 254 +++++++++++++++--- .../bqsr/StandardRecalibrationEngine.java | 219 --------------- 4 files changed, 249 insertions(+), 340 deletions(-) delete mode 100644 protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java delete mode 100644 public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java diff --git a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java b/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java deleted file mode 100644 index 3871101eb..000000000 --- a/protected/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/AdvancedRecalibrationEngine.java +++ /dev/null @@ -1,65 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import org.broadinstitute.sting.utils.classloader.ProtectedPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.EventType; -import org.broadinstitute.sting.utils.recalibration.ReadCovariates; -import org.broadinstitute.sting.utils.recalibration.RecalDatum; -import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -public class AdvancedRecalibrationEngine extends StandardRecalibrationEngine implements ProtectedPackageSource { - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - final RecalibrationTables tables = getRecalibrationTables(); - final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! recalInfo.skip(offset) ) { - - for (final EventType eventType : EventType.values()) { - final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - - incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); - } - } - } - } - } -} diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java index ffcfd6233..2c774d94d 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/BaseRecalibrator.java @@ -113,12 +113,11 @@ import java.util.List; @ReadFilters({MappingQualityZeroFilter.class, MappingQualityUnavailableFilter.class, UnmappedReadFilter.class, NotPrimaryAlignmentFilter.class, DuplicateReadFilter.class, FailsVendorQualityCheckFilter.class}) @PartitionBy(PartitionType.READ) public class BaseRecalibrator extends ReadWalker implements NanoSchedulable { + /** + * all the command line arguments for BQSR and it's covariates + */ @ArgumentCollection - private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); // all the command line arguments for BQSR and it's covariates - - @Advanced - @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) - public double BAQGOP = BAQ.DEFAULT_GOP; + private final RecalibrationArgumentCollection RAC = new RecalibrationArgumentCollection(); /** * When you have nct > 1, BQSR uses nct times more memory to compute its recalibration tables, for efficiency @@ -129,9 +128,19 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche @Argument(fullName = "lowMemoryMode", shortName="lowMemoryMode", doc="Reduce memory usage in multi-threaded code at the expense of threading efficiency", required = false) public boolean lowMemoryMode = false; - private QuantizationInfo quantizationInfo; // an object that keeps track of the information necessary for quality score quantization + @Advanced + @Argument(fullName = "bqsrBAQGapOpenPenalty", shortName="bqsrBAQGOP", doc="BQSR BAQ gap open penalty (Phred Scaled). Default value is 40. 30 is perhaps better for whole genome call sets", required = false) + public double BAQGOP = BAQ.DEFAULT_GOP; - private Covariate[] requestedCovariates; // list to hold the all the covariate objects that were requested (required + standard + experimental) + /** + * an object that keeps track of the information necessary for quality score quantization + */ + private QuantizationInfo quantizationInfo; + + /** + * list to hold the all the covariate objects that were requested (required + standard + experimental) + */ + private Covariate[] requestedCovariates; private RecalibrationEngine recalibrationEngine; @@ -189,30 +198,20 @@ public class BaseRecalibrator extends ReadWalker implements NanoSche throw new UserException.CouldNotCreateOutputFile(RAC.RECAL_TABLE_FILE, e); } - int numReadGroups = 0; - for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) - numReadGroups += header.getReadGroups().size(); - - recalibrationEngine = initializeRecalibrationEngine(); - recalibrationEngine.initialize(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG); - if ( lowMemoryMode ) - recalibrationEngine.enableLowMemoryMode(); - + initializeRecalibrationEngine(); minimumQToUse = getToolkit().getArguments().PRESERVE_QSCORES_LESS_THAN; referenceReader = getToolkit().getReferenceDataSource().getReference(); } - private RecalibrationEngine initializeRecalibrationEngine() { + /** + * Initialize the recalibration engine + */ + private void initializeRecalibrationEngine() { + int numReadGroups = 0; + for ( final SAMFileHeader header : getToolkit().getSAMFileHeaders() ) + numReadGroups += header.getReadGroups().size(); - final Class recalibrationEngineClass = GATKLiteUtils.getProtectedClassIfAvailable(RecalibrationEngine.class); - try { - final Constructor constructor = recalibrationEngineClass.getDeclaredConstructor((Class[])null); - constructor.setAccessible(true); - return (RecalibrationEngine)constructor.newInstance(); - } - catch (Exception e) { - throw new ReviewedStingException("Unable to create RecalibrationEngine class instance " + recalibrationEngineClass.getSimpleName()); - } + recalibrationEngine = new RecalibrationEngine(requestedCovariates, numReadGroups, RAC.RECAL_TABLE_UPDATE_LOG, lowMemoryMode); } private boolean isLowQualityBase( final GATKSAMRecord read, final int offset ) { diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index 6c3189be5..c6d5cddb9 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -1,37 +1,90 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + package org.broadinstitute.sting.gatk.walkers.bqsr; import com.google.java.contract.Requires; +import org.broadinstitute.sting.utils.collections.NestedIntegerArray; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.recalibration.RecalDatum; import org.broadinstitute.sting.utils.recalibration.RecalibrationTables; import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; import org.broadinstitute.sting.utils.sam.GATKSAMRecord; import java.io.PrintStream; +import java.util.LinkedList; +import java.util.List; + +public class RecalibrationEngine { + final protected Covariate[] covariates; + final private int numReadGroups; + final private PrintStream maybeLogStream; + final private boolean lowMemoryMode; + + /** + * Has finalizeData() been called? + */ + private boolean finalized = false; + + /** + * The final (merged, etc) recalibration tables, suitable for downstream analysis. + */ + private RecalibrationTables finalRecalibrationTables = null; + + private final List recalibrationTablesList = new LinkedList(); + + private final ThreadLocal threadLocalTables = new ThreadLocal() { + private synchronized RecalibrationTables makeAndCaptureTable() { + final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); + recalibrationTablesList.add(newTable); + return newTable; + } + + @Override + protected synchronized RecalibrationTables initialValue() { + if ( lowMemoryMode ) { + return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); + } else { + return makeAndCaptureTable(); + } + } + }; + + /** + * Get a recalibration table suitable for updating the underlying RecalDatums + * + * May return a thread-local version, or a single version, depending on the initialization + * arguments of this instance. + * + * @return + */ + protected RecalibrationTables getUpdatableRecalibrationTables() { + return threadLocalTables.get(); + } -/* -* Copyright (c) 2009 The Broad Institute -* -* Permission is hereby granted, free of charge, to any person -* obtaining a copy of this software and associated documentation -* files (the "Software"), to deal in the Software without -* restriction, including without limitation the rights to use, -* copy, modify, merge, publish, distribute, sublicense, and/or sell -* copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following -* conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -* OTHER DEALINGS IN THE SOFTWARE. -*/ -public interface RecalibrationEngine { /** * Initialize the recalibration engine * @@ -45,23 +98,164 @@ public interface RecalibrationEngine { * @param numReadGroups the number of read groups we should use for the recalibration tables * @param maybeLogStream an optional print stream for logging calls to the nestedhashmap in the recalibration tables */ - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream); + public RecalibrationEngine(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream, final boolean enableLowMemoryMode) { + if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); + if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); + + this.covariates = covariates.clone(); + this.numReadGroups = numReadGroups; + this.maybeLogStream = maybeLogStream; + this.lowMemoryMode = enableLowMemoryMode; + } /** * Update the recalibration statistics using the information in recalInfo * @param recalInfo data structure holding information about the recalibration values for a single read */ @Requires("recalInfo != null") - public void updateDataForRead(final ReadRecalibrationInfo recalInfo); + public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { + final GATKSAMRecord read = recalInfo.getRead(); + final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); + final RecalibrationTables tables = getUpdatableRecalibrationTables(); + final NestedIntegerArray qualityScoreTable = tables.getQualityScoreTable(); + + for( int offset = 0; offset < read.getReadBases().length; offset++ ) { + if( ! recalInfo.skip(offset) ) { + + for (final EventType eventType : EventType.values()) { + final int[] keys = readCovariates.getKeySet(offset, eventType); + final int eventIndex = eventType.index; + final byte qual = recalInfo.getQual(eventType, offset); + final double isError = recalInfo.getErrorFraction(eventType, offset); + + incrementDatumOrPutIfNecessary(qualityScoreTable, qual, isError, keys[0], keys[1], eventIndex); + + for (int i = 2; i < covariates.length; i++) { + if (keys[i] < 0) + continue; + + incrementDatumOrPutIfNecessary(tables.getTable(i), qual, isError, keys[0], keys[1], keys[i], eventIndex); + } + } + } + } + } + + /** + * creates a datum object with one observation and one or zero error + * + * @param reportedQual the quality score reported by the instrument for this base + * @param isError whether or not the observation is an error + * @return a new RecalDatum object with the observation and the error + */ + protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { + return new RecalDatum(1, isError, reportedQual); + } /** * Finalize, if appropriate, all derived data in recalibrationTables. * * Called once after all calls to updateDataForRead have been issued. + * + * Assumes that all of the principal tables (by quality score) have been completely updated, + * and walks over this data to create summary data tables like by read group table. */ - public void finalizeData(); + public void finalizeData() { + if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - public void enableLowMemoryMode(); + // merge all of the thread-local tables + finalRecalibrationTables = mergeThreadLocalRecalibrationTables(); - public RecalibrationTables getFinalRecalibrationTables(); + final NestedIntegerArray byReadGroupTable = finalRecalibrationTables.getReadGroupTable(); + final NestedIntegerArray byQualTable = finalRecalibrationTables.getQualityScoreTable(); + + // iterate over all values in the qual table + for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { + final int rgKey = leaf.keys[0]; + final int eventIndex = leaf.keys[2]; + final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); + final RecalDatum qualDatum = leaf.value; + + if ( rgDatum == null ) { + // create a copy of qualDatum, and initialize byReadGroup table with it + byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); + } else { + // combine the qual datum with the existing datum in the byReadGroup table + rgDatum.combine(qualDatum); + } + } + + finalized = true; + } + + /** + * Merge all of the thread local recalibration tables into a single one. + * + * Reuses one of the recalibration tables to hold the merged table, so this function can only be + * called once in the engine. + * + * @return the merged recalibration table + */ + @Requires("! finalized") + private RecalibrationTables mergeThreadLocalRecalibrationTables() { + if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); + + RecalibrationTables merged = null; + for ( final RecalibrationTables table : recalibrationTablesList ) { + if ( merged == null ) + // fast path -- if there's only only one table, so just make it the merged one + merged = table; + else { + merged.combine(table); + } + } + + return merged; + } + + /** + * Get the final recalibration tables, after finalizeData() has been called + * + * This returns the finalized recalibration table collected by this engine. + * + * It is an error to call this function before finalizeData has been called + * + * @return the finalized recalibration table collected by this engine + */ + public RecalibrationTables getFinalRecalibrationTables() { + if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); + return finalRecalibrationTables; + } + + /** + * Increments the RecalDatum at the specified position in the specified table, or put a new item there + * if there isn't already one. + * + * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() + * to return false if another thread inserts a new item at our position in the middle of our put operation. + * + * @param table the table that holds/will hold our item + * @param qual qual for this event + * @param isError error value for this event + * @param keys location in table of our item + */ + protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, + final byte qual, + final double isError, + final int... keys ) { + final RecalDatum existingDatum = table.get(keys); + + if ( existingDatum == null ) { + // No existing item, try to put a new one + if ( ! table.put(createDatumObject(qual, isError), keys) ) { + // Failed to put a new item because another thread came along and put an item here first. + // Get the newly-put item and increment it (item is guaranteed to exist at this point) + table.get(keys).increment(1.0, isError); + } + } + else { + // Easy case: already an item here, so increment it + existingDatum.increment(1.0, isError); + } + } } diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java deleted file mode 100644 index 0cd042eeb..000000000 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/StandardRecalibrationEngine.java +++ /dev/null @@ -1,219 +0,0 @@ -package org.broadinstitute.sting.gatk.walkers.bqsr; - -/* - * Copyright (c) 2009 The Broad Institute - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -import com.google.java.contract.Requires; -import org.apache.log4j.Logger; -import org.broadinstitute.sting.utils.classloader.PublicPackageSource; -import org.broadinstitute.sting.utils.collections.NestedIntegerArray; -import org.broadinstitute.sting.utils.recalibration.*; -import org.broadinstitute.sting.utils.recalibration.covariates.Covariate; -import org.broadinstitute.sting.utils.sam.GATKSAMRecord; - -import java.io.PrintStream; -import java.util.LinkedList; -import java.util.List; - -public class StandardRecalibrationEngine implements RecalibrationEngine, PublicPackageSource { - private static final Logger logger = Logger.getLogger(StandardRecalibrationEngine.class); - protected Covariate[] covariates; - private int numReadGroups; - private PrintStream maybeLogStream; - private boolean lowMemoryMode = false; - - private boolean finalized = false; - private RecalibrationTables mergedRecalibrationTables = null; - - private final List recalibrationTablesList = new LinkedList(); - - private final ThreadLocal threadLocalTables = new ThreadLocal() { - private synchronized RecalibrationTables makeAndCaptureTable() { - logger.info("Creating RecalibrationTable for " + Thread.currentThread()); - final RecalibrationTables newTable = new RecalibrationTables(covariates, numReadGroups, maybeLogStream); - recalibrationTablesList.add(newTable); - return newTable; - } - - @Override - protected synchronized RecalibrationTables initialValue() { - if ( lowMemoryMode ) { - return recalibrationTablesList.isEmpty() ? makeAndCaptureTable() : recalibrationTablesList.get(0); - } else { - return makeAndCaptureTable(); - } - } - }; - - protected RecalibrationTables getRecalibrationTables() { - return threadLocalTables.get(); - } - - public void enableLowMemoryMode() { - this.lowMemoryMode = true; - } - - @Override - public void initialize(final Covariate[] covariates, final int numReadGroups, final PrintStream maybeLogStream) { - if ( covariates == null ) throw new IllegalArgumentException("Covariates cannot be null"); - if ( numReadGroups < 1 ) throw new IllegalArgumentException("numReadGroups must be >= 1 but got " + numReadGroups); - - this.covariates = covariates.clone(); - this.numReadGroups = numReadGroups; - this.maybeLogStream = maybeLogStream; - } - - @Override - public void updateDataForRead( final ReadRecalibrationInfo recalInfo ) { - final GATKSAMRecord read = recalInfo.getRead(); - final EventType eventType = EventType.BASE_SUBSTITUTION; - final ReadCovariates readCovariates = recalInfo.getCovariatesValues(); - - for( int offset = 0; offset < read.getReadBases().length; offset++ ) { - if( ! recalInfo.skip(offset) ) { - final byte qual = recalInfo.getQual(eventType, offset); - final double isError = recalInfo.getErrorFraction(eventType, offset); - final int[] keys = readCovariates.getKeySet(offset, eventType); - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getQualityScoreTable(), qual, isError, keys[0], keys[1], eventType.index); - - for (int i = 2; i < covariates.length; i++) { - if (keys[i] < 0) - continue; - - incrementDatumOrPutIfNecessary(getRecalibrationTables().getTable(i), qual, isError, keys[0], keys[1], keys[i], eventType.index); - } - } - } - } - - /** - * creates a datum object with one observation and one or zero error - * - * @param reportedQual the quality score reported by the instrument for this base - * @param isError whether or not the observation is an error - * @return a new RecalDatum object with the observation and the error - */ - protected RecalDatum createDatumObject(final byte reportedQual, final double isError) { - return new RecalDatum(1, isError, reportedQual); - } - - /** - * Create derived recalibration data tables - * - * Assumes that all of the principal tables (by quality score) have been completely updated, - * and walks over this data to create summary data tables like by read group table. - */ - @Override - public void finalizeData() { - if ( finalized ) throw new IllegalStateException("FinalizeData() has already been called"); - - // merge all of the thread-local tables - mergedRecalibrationTables = mergeThreadLocalRecalibrationTables(); - - final NestedIntegerArray byReadGroupTable = mergedRecalibrationTables.getReadGroupTable(); - final NestedIntegerArray byQualTable = mergedRecalibrationTables.getQualityScoreTable(); - - // iterate over all values in the qual table - for ( NestedIntegerArray.Leaf leaf : byQualTable.getAllLeaves() ) { - final int rgKey = leaf.keys[0]; - final int eventIndex = leaf.keys[2]; - final RecalDatum rgDatum = byReadGroupTable.get(rgKey, eventIndex); - final RecalDatum qualDatum = leaf.value; - - if ( rgDatum == null ) { - // create a copy of qualDatum, and initialize byReadGroup table with it - byReadGroupTable.put(new RecalDatum(qualDatum), rgKey, eventIndex); - } else { - // combine the qual datum with the existing datum in the byReadGroup table - rgDatum.combine(qualDatum); - } - } - - finalized = true; - } - - /** - * Merge all of the thread local recalibration tables into a single one. - * - * Reuses one of the recalibration tables to hold the merged table, so this function can only be - * called once in the engine. - * - * @return the merged recalibration table - */ - @Requires("! finalized") - private RecalibrationTables mergeThreadLocalRecalibrationTables() { - if ( recalibrationTablesList.isEmpty() ) throw new IllegalStateException("recalibration tables list is empty"); - - RecalibrationTables merged = null; - for ( final RecalibrationTables table : recalibrationTablesList ) { - if ( merged == null ) - // fast path -- if there's only only one table, so just make it the merged one - merged = table; - else { - merged.combine(table); - } - } - - return merged; - } - - public RecalibrationTables getFinalRecalibrationTables() { - if ( ! finalized ) throw new IllegalStateException("Cannot get final recalibration tables until finalizeData() has been called"); - return mergedRecalibrationTables; - } - - /** - * Increments the RecalDatum at the specified position in the specified table, or put a new item there - * if there isn't already one. - * - * Does this in a thread-safe way WITHOUT being synchronized: relies on the behavior of NestedIntegerArray.put() - * to return false if another thread inserts a new item at our position in the middle of our put operation. - * - * @param table the table that holds/will hold our item - * @param qual qual for this event - * @param isError error value for this event - * @param keys location in table of our item - */ - protected void incrementDatumOrPutIfNecessary( final NestedIntegerArray table, - final byte qual, - final double isError, - final int... keys ) { - final RecalDatum existingDatum = table.get(keys); - - if ( existingDatum == null ) { - // No existing item, try to put a new one - if ( ! table.put(createDatumObject(qual, isError), keys) ) { - // Failed to put a new item because another thread came along and put an item here first. - // Get the newly-put item and increment it (item is guaranteed to exist at this point) - table.get(keys).increment(1.0, isError); - } - } - else { - // Easy case: already an item here, so increment it - existingDatum.increment(1.0, isError); - } - } -} From a5901cdd2037833568fa13e38f74952e261cc80a Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 10:54:02 -0500 Subject: [PATCH 14/18] Bugfix for printProgress in TraverseReadsNano -- Must provide a single bp position (1:10) not the range of the read (1:1-50). ProgressMeter now checks at runtime for this problem as well. --- .../sting/gatk/traversals/TraverseReadsNano.java | 3 ++- .../sting/utils/progressmeter/ProgressMeter.java | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java index ee71d82bb..aa33def62 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java +++ b/public/java/src/org/broadinstitute/sting/gatk/traversals/TraverseReadsNano.java @@ -65,7 +65,8 @@ public class TraverseReadsNano extends TraversalEngine, @Override public void progress(MapData lastProcessedMap) { if ( lastProcessedMap.refContext != null ) - printProgress(lastProcessedMap.refContext.getLocus()); + // note, need to use getStopLocation so we don't give an interval to ProgressMeterDaemon + printProgress(lastProcessedMap.refContext.getLocus().getStopLocation()); } }); } diff --git a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java index c9d849227..b36326722 100755 --- a/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java +++ b/public/java/src/org/broadinstitute/sting/utils/progressmeter/ProgressMeter.java @@ -234,11 +234,13 @@ public class ProgressMeter { * the progress itself. A separate printing daemon periodically polls the meter to print out * progress * - * @param loc Current location, can be null if you are at the end of the processing unit + * @param loc Current location, can be null if you are at the end of the processing unit. Must + * have size == 1 (cannot be multiple bases in size). * @param nTotalRecordsProcessed the total number of records we've processed */ public synchronized void notifyOfProgress(final GenomeLoc loc, final long nTotalRecordsProcessed) { if ( nTotalRecordsProcessed < 0 ) throw new IllegalArgumentException("nTotalRecordsProcessed must be >= 0"); + if ( loc.size() != 1 ) throw new IllegalArgumentException("GenomeLoc must have size == 1 but got " + loc); // weird comparison to ensure that loc == null (in unmapped reads) is keep before maxGenomeLoc == null (on startup) this.maxGenomeLoc = loc == null ? loc : (maxGenomeLoc == null ? loc : loc.max(maxGenomeLoc)); From 810e2da1d47b973f93be2102de2205310ea98a37 Mon Sep 17 00:00:00 2001 From: Mark DePristo Date: Fri, 4 Jan 2013 11:38:58 -0500 Subject: [PATCH 15/18] Cleanup and unit tests for EventType and ReadRecalibrationInfo in BQSR -- Added unit tests for EventType and ReadRecalibrationInfo -- Simplified interface of EventType. Previously this enum carried an index with it, but this is redundant with the enum.ordinal function. Now just using that function instead. --- .../walkers/bqsr/ReadRecalibrationInfo.java | 7 +- .../walkers/bqsr/RecalibrationEngine.java | 2 +- .../recalibration/BaseRecalibration.java | 10 +- .../sting/utils/recalibration/EventType.java | 42 ++++--- .../utils/recalibration/ReadCovariates.java | 24 ++-- .../recalibration/RecalibrationReport.java | 6 +- .../bqsr/ReadRecalibrationInfoUnitTest.java | 110 ++++++++++++++++++ .../recalibration/EventTypeUnitTest.java | 61 ++++++++++ .../RecalibrationReportUnitTest.java | 8 +- 9 files changed, 216 insertions(+), 54 deletions(-) create mode 100644 public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java create mode 100644 public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java index 121e3449b..b884b89db 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfo.java @@ -58,7 +58,8 @@ public final class ReadRecalibrationInfo { if ( covariates == null ) throw new IllegalArgumentException("covariates cannot be null"); if ( skips == null ) throw new IllegalArgumentException("skips cannot be null"); if ( snpErrors == null ) throw new IllegalArgumentException("snpErrors cannot be null"); - // future: may allow insertionErrors && deletionErrors to be null, so don't enforce + if ( insertionErrors == null ) throw new IllegalArgumentException("insertionErrors cannot be null"); + if ( deletionErrors == null ) throw new IllegalArgumentException("deletionErrors cannot be null"); this.read = read; this.baseQuals = read.getBaseQualities(); @@ -73,8 +74,8 @@ public final class ReadRecalibrationInfo { if ( skips.length != length ) throw new IllegalArgumentException("skips.length " + snpErrors.length + " != length " + length); if ( snpErrors.length != length ) throw new IllegalArgumentException("snpErrors.length " + snpErrors.length + " != length " + length); - if ( insertionErrors != null && insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); - if ( deletionErrors != null && deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); + if ( insertionErrors.length != length ) throw new IllegalArgumentException("insertionErrors.length " + snpErrors.length + " != length " + length); + if ( deletionErrors.length != length ) throw new IllegalArgumentException("deletionErrors.length " + snpErrors.length + " != length " + length); } /** diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java index c6d5cddb9..910519031 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/bqsr/RecalibrationEngine.java @@ -124,7 +124,7 @@ public class RecalibrationEngine { for (final EventType eventType : EventType.values()) { final int[] keys = readCovariates.getKeySet(offset, eventType); - final int eventIndex = eventType.index; + final int eventIndex = eventType.ordinal(); final byte qual = recalInfo.getQual(eventType, offset); final double isError = recalInfo.getErrorFraction(eventType, offset); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java index 3e0f36799..43c9bd2b5 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/BaseRecalibration.java @@ -198,7 +198,7 @@ public class BaseRecalibration { } private double getGlobalDeltaQ(final int rgKey, final EventType errorModel) { - final Double cached = globalDeltaQs.get(rgKey, errorModel.index); + final Double cached = globalDeltaQs.get(rgKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateGlobalDeltaQ(rgKey, errorModel); @@ -210,7 +210,7 @@ public class BaseRecalibration { } private double getDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ) { - final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.index); + final Double cached = deltaQReporteds.get(rgKey, qualKey, errorModel.ordinal()); if ( TEST_CACHING ) { final double calcd = calculateDeltaQReported(rgKey, qualKey, errorModel, globalDeltaQ, (byte)qualKey); @@ -240,7 +240,7 @@ public class BaseRecalibration { private double calculateGlobalDeltaQ(final int rgKey, final EventType errorModel) { double result = 0.0; - final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.index); + final RecalDatum empiricalQualRG = recalibrationTables.getReadGroupTable().get(rgKey, errorModel.ordinal()); if (empiricalQualRG != null) { final double globalDeltaQEmpirical = empiricalQualRG.getEmpiricalQuality(); @@ -254,7 +254,7 @@ public class BaseRecalibration { private double calculateDeltaQReported(final int rgKey, final int qualKey, final EventType errorModel, final double globalDeltaQ, final byte qualFromRead) { double result = 0.0; - final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.index); + final RecalDatum empiricalQualQS = recalibrationTables.getQualityScoreTable().get(rgKey, qualKey, errorModel.ordinal()); if (empiricalQualQS != null) { final double deltaQReportedEmpirical = empiricalQualQS.getEmpiricalQuality(); result = deltaQReportedEmpirical - qualFromRead - globalDeltaQ; @@ -287,7 +287,7 @@ public class BaseRecalibration { final double globalDeltaQ, final double deltaQReported, final byte qualFromRead) { - final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.index); + final RecalDatum empiricalQualCO = table.get(rgKey, qualKey, tableKey, errorModel.ordinal()); if (empiricalQualCO != null) { final double deltaQCovariateEmpirical = empiricalQualCO.getEmpiricalQuality(); return deltaQCovariateEmpirical - qualFromRead - (globalDeltaQ + deltaQReported); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java index 1c84518eb..63f873892 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/EventType.java @@ -1,41 +1,39 @@ package org.broadinstitute.sting.utils.recalibration; -import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; - public enum EventType { - BASE_SUBSTITUTION(0, "M", "Base Substitution"), - BASE_INSERTION(1, "I", "Base Insertion"), - BASE_DELETION(2, "D", "Base Deletion"); + BASE_SUBSTITUTION("M", "Base Substitution"), + BASE_INSERTION("I", "Base Insertion"), + BASE_DELETION("D", "Base Deletion"); - public final int index; private final String representation; private final String longRepresentation; - private EventType(int index, String representation, String longRepresentation) { - this.index = index; + private EventType(String representation, String longRepresentation) { this.representation = representation; this.longRepresentation = longRepresentation; } + /** + * Get the EventType corresponding to its ordinal index + * @param index an ordinal index + * @return the event type corresponding to ordinal index + */ public static EventType eventFrom(int index) { - switch (index) { - case 0: - return BASE_SUBSTITUTION; - case 1: - return BASE_INSERTION; - case 2: - return BASE_DELETION; - default: - throw new ReviewedStingException(String.format("Event %d does not exist.", index)); - } + return EventType.values()[index]; } - - public static EventType eventFrom(String event) { + + /** + * Get the EventType with short string representation + * @throws IllegalArgumentException if representation doesn't correspond to one of EventType + * @param representation short string representation of the event + * @return an EventType + */ + public static EventType eventFrom(String representation) { for (EventType eventType : EventType.values()) - if (eventType.representation.equals(event)) + if (eventType.representation.equals(representation)) return eventType; - throw new ReviewedStingException(String.format("Event %s does not exist.", event)); + throw new IllegalArgumentException(String.format("Event %s does not exist.", representation)); } @Override diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java index 4ddcb2b92..405b2d143 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/ReadCovariates.java @@ -68,9 +68,9 @@ public class ReadCovariates { * @param readOffset the read offset, must be >= 0 and <= the read length used to create this ReadCovariates */ public void addCovariate(final int mismatch, final int insertion, final int deletion, final int readOffset) { - keys[EventType.BASE_SUBSTITUTION.index][readOffset][currentCovariateIndex] = mismatch; - keys[EventType.BASE_INSERTION.index][readOffset][currentCovariateIndex] = insertion; - keys[EventType.BASE_DELETION.index][readOffset][currentCovariateIndex] = deletion; + keys[EventType.BASE_SUBSTITUTION.ordinal()][readOffset][currentCovariateIndex] = mismatch; + keys[EventType.BASE_INSERTION.ordinal()][readOffset][currentCovariateIndex] = insertion; + keys[EventType.BASE_DELETION.ordinal()][readOffset][currentCovariateIndex] = deletion; } /** @@ -81,11 +81,11 @@ public class ReadCovariates { * @return */ public int[] getKeySet(final int readPosition, final EventType errorModel) { - return keys[errorModel.index][readPosition]; + return keys[errorModel.ordinal()][readPosition]; } public int[][] getKeySet(final EventType errorModel) { - return keys[errorModel.index]; + return keys[errorModel.ordinal()]; } // ---------------------------------------------------------------------- @@ -94,17 +94,9 @@ public class ReadCovariates { // // ---------------------------------------------------------------------- - protected int[][] getMismatchesKeySet() { - return keys[EventType.BASE_SUBSTITUTION.index]; - } - - protected int[][] getInsertionsKeySet() { - return keys[EventType.BASE_INSERTION.index]; - } - - protected int[][] getDeletionsKeySet() { - return keys[EventType.BASE_DELETION.index]; - } + protected int[][] getMismatchesKeySet() { return getKeySet(EventType.BASE_SUBSTITUTION); } + protected int[][] getInsertionsKeySet() { return getKeySet(EventType.BASE_INSERTION); } + protected int[][] getDeletionsKeySet() { return getKeySet(EventType.BASE_DELETION); } protected int[] getMismatchesKeySet(final int readPosition) { return getKeySet(readPosition, EventType.BASE_SUBSTITUTION); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java index 6ecac1394..ff0890ff0 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/RecalibrationReport.java @@ -142,7 +142,7 @@ public class RecalibrationReport { tempCOVarray[2] = requestedCovariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex].keyFromValue(covValue); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempCOVarray[3] = event.index; + tempCOVarray[3] = event.ordinal(); recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + covIndex).put(getRecalDatum(reportTable, i, false), tempCOVarray); } @@ -161,7 +161,7 @@ public class RecalibrationReport { final Object qual = reportTable.get(i, RecalUtils.QUALITY_SCORE_COLUMN_NAME); tempQUALarray[1] = requestedCovariates[1].keyFromValue(qual); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempQUALarray[2] = event.index; + tempQUALarray[2] = event.ordinal(); qualTable.put(getRecalDatum(reportTable, i, false), tempQUALarray); } @@ -178,7 +178,7 @@ public class RecalibrationReport { final Object rg = reportTable.get(i, RecalUtils.READGROUP_COLUMN_NAME); tempRGarray[0] = requestedCovariates[0].keyFromValue(rg); final EventType event = EventType.eventFrom((String)reportTable.get(i, RecalUtils.EVENT_TYPE_COLUMN_NAME)); - tempRGarray[1] = event.index; + tempRGarray[1] = event.ordinal(); rgTable.put(getRecalDatum(reportTable, i, true), tempRGarray); } diff --git a/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java new file mode 100644 index 000000000..08a8f2dc1 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/gatk/walkers/bqsr/ReadRecalibrationInfoUnitTest.java @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.gatk.walkers.bqsr; + +import net.sf.samtools.SAMUtils; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.recalibration.EventType; +import org.broadinstitute.sting.utils.recalibration.ReadCovariates; +import org.broadinstitute.sting.utils.sam.ArtificialSAMUtils; +import org.broadinstitute.sting.utils.sam.GATKSAMRecord; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.EnumMap; +import java.util.List; + +public final class ReadRecalibrationInfoUnitTest extends BaseTest { + @DataProvider(name = "InfoProvider") + public Object[][] createCombineTablesProvider() { + List tests = new ArrayList(); + + for ( final int readLength: Arrays.asList(10, 100, 1000) ) { + for ( final boolean includeIndelErrors : Arrays.asList(true, false) ) { + tests.add(new Object[]{readLength, includeIndelErrors}); + } + } + + return tests.toArray(new Object[][]{}); + } + + @Test(dataProvider = "InfoProvider") + public void testReadInfo(final int readLength, final boolean includeIndelErrors) { + final ReadCovariates covariates = new ReadCovariates(readLength, 2); + + final byte[] bases = new byte[readLength]; + final byte[] baseQuals = new byte[readLength]; + final byte[] insertionQuals = new byte[readLength]; + final byte[] deletionQuals = new byte[readLength]; + final boolean[] skips = new boolean[readLength]; + final double[] snpErrors = new double[readLength]; + final double[] insertionErrors = new double[readLength]; + final double[] deletionsErrors = new double[readLength]; + for ( int i = 0; i < readLength; i++ ) { + bases[i] = 'A'; + baseQuals[i] = (byte)(i % SAMUtils.MAX_PHRED_SCORE); + insertionQuals[i] = (byte)((i+1) % SAMUtils.MAX_PHRED_SCORE); + deletionQuals[i] = (byte)((i+2) % SAMUtils.MAX_PHRED_SCORE); + skips[i] = i % 2 == 0; + snpErrors[i] = 1.0 / (i+1); + insertionErrors[i] = 0.5 / (i+1); + deletionsErrors[i] = 0.3 / (i+1); + } + + final EnumMap errors = new EnumMap(EventType.class); + errors.put(EventType.BASE_SUBSTITUTION, snpErrors); + errors.put(EventType.BASE_INSERTION, insertionErrors); + errors.put(EventType.BASE_DELETION, deletionsErrors); + + final EnumMap quals = new EnumMap(EventType.class); + quals.put(EventType.BASE_SUBSTITUTION, baseQuals); + quals.put(EventType.BASE_INSERTION, insertionQuals); + quals.put(EventType.BASE_DELETION, deletionQuals); + + final GATKSAMRecord read = ArtificialSAMUtils.createArtificialRead(bases, baseQuals, readLength + "M"); + if ( includeIndelErrors ) { + read.setBaseQualities(insertionQuals, EventType.BASE_INSERTION); + read.setBaseQualities(deletionQuals, EventType.BASE_DELETION); + } + + final ReadRecalibrationInfo info = new ReadRecalibrationInfo(read, covariates, skips, snpErrors, insertionErrors, deletionsErrors); + + Assert.assertEquals(info.getCovariatesValues(), covariates); + Assert.assertEquals(info.getRead(), read); + + for ( int i = 0; i < readLength; i++ ) { + Assert.assertEquals(info.skip(i), skips[i]); + for ( final EventType et : EventType.values() ) { + Assert.assertEquals(info.getErrorFraction(et, i), errors.get(et)[i]); + final byte expectedQual = et == EventType.BASE_SUBSTITUTION || includeIndelErrors ? quals.get(et)[i]: GATKSAMRecord.DEFAULT_INSERTION_DELETION_QUAL; + Assert.assertEquals(info.getQual(et, i), expectedQual); + } + } + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java new file mode 100644 index 000000000..53645e224 --- /dev/null +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/EventTypeUnitTest.java @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2012 The Broad Institute + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR + * THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +package org.broadinstitute.sting.utils.recalibration; + +import org.broadinstitute.sting.BaseTest; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.util.*; + +public final class EventTypeUnitTest extends BaseTest { + @Test + public void testEventTypes() { + for ( final EventType et : EventType.values() ) { + Assert.assertNotNull(et.toString()); + Assert.assertNotNull(et.prettyPrint()); + Assert.assertFalse("".equals(et.toString())); + Assert.assertFalse("".equals(et.prettyPrint())); + Assert.assertEquals(EventType.eventFrom(et.ordinal()), et); + Assert.assertEquals(EventType.eventFrom(et.toString()), et); + } + } + + @Test + public void testEventTypesEnumItself() { + final Set shortReps = new HashSet(); + for ( final EventType et : EventType.values() ) { + Assert.assertFalse(shortReps.contains(et.toString()), "Short representative for EventType has duplicates for " + et); + shortReps.add(et.toString()); + } + Assert.assertEquals(shortReps.size(), EventType.values().length, "Short representatives for EventType aren't unique"); + } + + @Test(expectedExceptions = IllegalArgumentException.class) + public void testBadString() { + EventType.eventFrom("asdfhalsdjfalkjsdf"); + } +} diff --git a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java index 9707ed078..aa0419fed 100644 --- a/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/utils/recalibration/RecalibrationReportUnitTest.java @@ -20,7 +20,7 @@ public class RecalibrationReportUnitTest { private static RecalDatum createRandomRecalDatum(int maxObservations, int maxErrors) { final Random random = new Random(); final int nObservations = random.nextInt(maxObservations); - final int nErrors = random.nextInt(maxErrors); + final int nErrors = Math.min(random.nextInt(maxErrors), nObservations); final int qual = random.nextInt(QualityUtils.MAX_QUAL_SCORE); return new RecalDatum(nObservations, nErrors, (byte)qual); } @@ -90,14 +90,14 @@ public class RecalibrationReportUnitTest { final int[] covariates = rc.getKeySet(offset, errorMode); final int randomMax = errorMode == EventType.BASE_SUBSTITUTION ? 10000 : 100000; - rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.index); - qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.index); + rgTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], errorMode.ordinal()); + qualTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], errorMode.ordinal()); nKeys += 2; for (int j = 0; j < optionalCovariates.size(); j++) { final NestedIntegerArray covTable = recalibrationTables.getTable(RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j); final int covValue = covariates[RecalibrationTables.TableType.OPTIONAL_COVARIATE_TABLES_START.index + j]; if ( covValue >= 0 ) { - covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.index); + covTable.put(createRandomRecalDatum(randomMax, 10), covariates[0], covariates[1], covValue, errorMode.ordinal()); nKeys++; } } From fe06912a87c2a55f3f22b5760855afc9d4987afe Mon Sep 17 00:00:00 2001 From: Tad Jordan Date: Fri, 4 Jan 2013 11:52:04 -0500 Subject: [PATCH 16/18] Removed sorting by row from walkers --- .../sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java | 2 +- .../gatk/walkers/varianteval/VariantEvalReportWriter.java | 2 +- .../sting/utils/recalibration/QuantizationInfo.java | 4 ++-- .../broadinstitute/sting/gatk/report/GATKReportUnitTest.java | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java index 5972322f8..98e581e21 100755 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/diagnostics/ErrorRatePerCycle.java @@ -124,7 +124,7 @@ public class ErrorRatePerCycle extends LocusWalker { public void initialize() { report = new GATKReport(); - report.addTable(reportName, reportDescription, 6, GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(reportName, reportDescription, 6); table = report.getTable(reportName); table.addColumn("readgroup"); table.addColumn("cycle"); diff --git a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java index 6dad128fe..91efd1ffd 100644 --- a/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java +++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/varianteval/VariantEvalReportWriter.java @@ -162,7 +162,7 @@ public class VariantEvalReportWriter { // create the table final String tableName = ve.getSimpleName(); final String tableDesc = ve.getClass().getAnnotation(Analysis.class).description(); - report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size()), GATKReportTable.TableSortingWay.SORT_BY_ROW); + report.addTable(tableName, tableDesc, 1 + stratifiers.size() + (scanner.hasMoltenField() ? 2 : datamap.size())); // grab the table, and add the columns we need to it final GATKReportTable table = report.getTable(tableName); diff --git a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java index fc942499c..5cf16dc9f 100644 --- a/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java +++ b/public/java/src/org/broadinstitute/sting/utils/recalibration/QuantizationInfo.java @@ -67,9 +67,9 @@ public class QuantizationInfo { return quantizationLevels; } - public GATKReportTable generateReportTable(boolean sortBycols) { + public GATKReportTable generateReportTable(boolean sortByCols) { GATKReportTable quantizedTable; - if(sortBycols) { + if(sortByCols) { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3, GATKReportTable.TableSortingWay.SORT_BY_COLUMN); } else { quantizedTable = new GATKReportTable(RecalUtils.QUANTIZED_REPORT_TABLE_TITLE, "Quality quantization map", 3); diff --git a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java index 40d8d8ff9..0637e9a25 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/report/GATKReportUnitTest.java @@ -105,7 +105,7 @@ public class GATKReportUnitTest extends BaseTest { private boolean isSorted(GATKReportTable table) { boolean result = true; - File testingSortingTableFile = new File("myFile.txt"); + File testingSortingTableFile = new File("testSortingFile.txt"); try { // Connect print stream to the output stream From ab5526b3729cf481b96f1ca69cc399a1c396ca75 Mon Sep 17 00:00:00 2001 From: Joel Thibault Date: Fri, 4 Jan 2013 13:24:57 -0500 Subject: [PATCH 18/18] More TODOs --- .../sting/gatk/traversals/TraverseActiveRegionsUnitTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java index 5051bc35f..0ec4f57f6 100644 --- a/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java +++ b/public/java/test/org/broadinstitute/sting/gatk/traversals/TraverseActiveRegionsUnitTest.java @@ -126,10 +126,10 @@ public class TraverseActiveRegionsUnitTest extends BaseTest { // TODO: reads which span many regions // TODO: reads which are partially between intervals (in/outside extension) // TODO: duplicate reads - + // TODO: read at the end of a contig // TODO: reads which are completely outside intervals but within extension // TODO: test the extension itself - + // TODO: unmapped reads intervals = new ArrayList(); intervals.add(genomeLocParser.createGenomeLoc("1", 10, 20));