Merge branch 'master' of ssh://gsa1/humgen/gsa-scr1/gsa-engineering/git/unstable

This commit is contained in:
Matt Hanna 2012-02-09 11:26:48 -05:00
commit aa097a83d5
4 changed files with 134 additions and 357 deletions

View File

@ -24,7 +24,6 @@ public class GATKReport {
/**
* Create a new GATKReport with the contents of a GATKReport on disk.
*
* @param filename the path to the file to load
*/
public GATKReport(String filename) {
@ -33,7 +32,6 @@ public class GATKReport {
/**
* Create a new GATKReport with the contents of a GATKReport on disk.
*
* @param file the file to load
*/
public GATKReport(File file) {
@ -42,7 +40,6 @@ public class GATKReport {
/**
* Load a GATKReport file from disk
*
* @param file the file to load
*/
private void loadReport(File file) {
@ -51,11 +48,12 @@ public class GATKReport {
GATKReportTable table = null;
String[] header = null;
int id = 0;
GATKReportVersion version = null;
List<Integer> columnStarts = null;
String line;
while ((line = reader.readLine()) != null) {
while ( (line = reader.readLine()) != null ) {
if (line.startsWith(GATKREPORT_HEADER_PREFIX)) {
@ -73,7 +71,7 @@ public class GATKReport {
header = null;
columnStarts = null;
} else if (line.trim().isEmpty()) {
} else if ( line.trim().isEmpty() ) {
// do nothing
} else {
if (table != null) {
@ -99,22 +97,19 @@ public class GATKReport {
if (header == null) {
header = splitLine;
// Set the first column as the primary key
table.addPrimaryKey(header[0]);
// Set every other column as column
for (int i = 1; i < header.length; i++) {
table.addColumn(header[i], "");
table.addPrimaryKey("id", false);
for ( String columnName : header ) {
table.addColumn(columnName, "");
}
id = 0;
} else {
//Get primary key Value from the current line array
String primaryKey = splitLine[0];
//Input all the remaining values
for (int columnIndex = 1; columnIndex < header.length; columnIndex++) {
table.set(primaryKey, header[columnIndex], splitLine[columnIndex]);
for (int columnIndex = 0; columnIndex < header.length; columnIndex++) {
table.set(id, header[columnIndex], splitLine[columnIndex]);
}
id++;
}
}
}
@ -180,24 +175,4 @@ public class GATKReport {
/**
 * Returns every table currently held by this report.
 *
 * @return the collection produced by {@code tables.values()}
 */
public Collection<GATKReportTable> getTables() {
    final Collection<GATKReportTable> allTables = tables.values();
    return allTables;
}
/**
 * Merges the rows of every table in {@code input} into the table of the same name in this report.
 *
 * All table names in {@code input} are checked against this report before any rows are merged,
 * so a table-name mismatch can no longer leave this report partially combined.
 *
 * @param input the report whose tables should be merged into this one
 * @throws ReviewedStingException if {@code input} contains a table name that this report does not have
 */
public void combineWith(GATKReport input) {
    // First pass: fail fast on any unknown table before touching our own data.
    for (String tableName : input.tables.keySet()) {
        if (!tables.containsKey(tableName)) {
            throw new ReviewedStingException("Failed to combine GATKReport, tables don't match!");
        }
    }

    // Second pass: every input table has a counterpart here, so merge them one by one.
    // NOTE(review): mergeRows may still throw on a column mismatch after earlier tables were merged.
    for (String tableName : input.tables.keySet()) {
        tables.get(tableName).mergeRows(input.getTable(tableName));
    }
}
}

View File

@ -1,46 +0,0 @@
package org.broadinstitute.sting.gatk.report;
import org.broadinstitute.sting.commandline.Gatherer;
import org.broadinstitute.sting.utils.exceptions.UserException;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.PrintStream;
import java.util.List;
/**
 * Gatherer that combines several GATKReport files into a single report file.
 *
 * The first input is loaded as the base report and every further input is merged into it
 * with {@link GATKReport#combineWith(GATKReport)}; the result is printed to the output file.
 *
 * @author roger
 * @since 1/9/12
 */
public class GATKReportGatherer extends Gatherer {
    /**
     * Combines the input GATKReports into one output report.
     *
     * @param inputs the report files to combine, in the order they should be merged
     * @param output the file the combined report is written to
     * @throws UserException if the output file cannot be opened for writing
     */
    @Override
    public void gather(List<File> inputs, File output) {
        final PrintStream out;
        try {
            out = new PrintStream(output);
        } catch (FileNotFoundException e) {
            throw new UserException("Could not open GATKReport gather output file " + output + ": " + e.getMessage());
        }

        try {
            GATKReport combined = null;
            for (File input : inputs) {
                final GATKReport loaded = new GATKReport(input);
                if (combined == null) {
                    // The first input becomes the base that the others are merged into.
                    combined = loaded;
                } else {
                    combined.combineWith(loaded);
                }
            }

            // With no inputs at all, an empty report is printed.
            if (combined == null) {
                combined = new GATKReport();
            }
            combined.print(out);
        } finally {
            // Always release the file handle, even when loading or merging fails.
            out.close();
        }
    }
}

View File

@ -4,10 +4,7 @@ import org.apache.commons.lang.ObjectUtils;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import java.io.PrintStream;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.TreeSet;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -15,12 +12,12 @@ import java.util.regex.Pattern;
* A data structure that allows data to be collected over the course of a walker's computation, then have that data
* written to a PrintStream such that it's human-readable, AWK-able, and R-friendly (given that you load it using the
* GATKReport loader module).
* <p/>
*
* The goal of this object is to use the same data structure for both accumulating data during a walker's computation
* and emitting that data to a file for easy analysis in R (or any other program/language that can take in a table of
* results). Thus, all of the infrastructure below is designed simply to make printing the following as easy as
* possible:
* <p/>
*
* ##:GATKReport.v0.1 ErrorRatePerCycle : The error rate per sequenced position in the reads
* cycle errorrate.61PA8.7 qualavg.61PA8.7
* 0 0.007451835696110506 25.474613284804366
@ -32,60 +29,60 @@ import java.util.regex.Pattern;
* 6 5.452562704471102E-4 36.1217248908297
* 7 5.452562704471102E-4 36.1910480349345
* 8 5.452562704471102E-4 36.00345705967977
* <p/>
*
* Here, we have a GATKReport table - a well-formatted, easy to read representation of some tabular data. Every single
* table has this same GATKReport.v0.1 header, which permits multiple files from different sources to be cat-ed
* together, which makes it very easy to pull tables from different programs into R via a single file.
* <p/>
*
* ------------
* Definitions:
* <p/>
*
* Table info:
* The first line, structured as
* ##:<report version> <table name> : <table description>
* <p/>
*
* Table header:
* The second line, specifying a unique name for each column in the table.
* <p/>
*
* The first column mentioned in the table header is the "primary key" column - a column that provides the unique
* identifier for each row in the table. Once this column is created, any element in the table can be referenced by
* the row-column coordinate, i.e. "primary key"-"column name" coordinate.
* <p/>
*
* When a column is added to a table, a default value must be specified (usually 0). This is the initial value for
* an element in a column. This permits operations like increment() and decrement() to work properly on columns that
* are effectively counters for a particular event.
* <p/>
*
* Finally, the display property for each column can be set during column creation. This is useful when a given
* column stores an intermediate result that will be used later on, perhaps to calculate the value of another column.
* In these cases, it's obviously necessary to store the value required for further computation, but it's not
* necessary to actually print the intermediate column.
* <p/>
*
* Table body:
* The values of the table itself.
* <p/>
*
* ---------------
* Implementation:
* <p/>
*
* The implementation of this table has two components:
* 1. A TreeSet<Object> that stores all the values ever specified for the primary key. Any get() operation that
* refers to an element where the primary key object does not exist will result in its implicit creation. I
* haven't yet decided if this is a good idea...
* <p/>
*
* 2. A HashMap<String, GATKReportColumn> that stores a mapping from column name to column contents. Each
* GATKReportColumn is effectively a map (in fact, GATKReportColumn extends TreeMap<Object, Object>) between
* primary key and the column value. This means that, given N columns, the primary key information is stored
* N+1 times. This is obviously wasteful and can likely be handled much more elegantly in future implementations.
* <p/>
*
* ------------------------------
* Element and column operations:
* <p/>
*
* In addition to simply getting and setting values, this object also permits some simple operations to be applied to
* individual elements or to whole columns. For instance, an element can be easily incremented without the hassle of
* calling get(), incrementing the obtained value by 1, and then calling set() with the new value. Also, some vector
* operations are supported. For instance, two whole columns can be divided and have the result be set to a third
* column. This is especially useful when aggregating counts in two intermediate columns that will eventually need to
* be manipulated row-by-row to compute the final column.
* <p/>
*
* Note: I've made no attempt whatsoever to make these operations efficient. Right now, some of the methods check the
* type of the stored object using an instanceof call and attempt to do the right thing. Others cast the contents of
* the cell to a Number, call the Number.toDouble() method and compute a result. This is clearly not the ideal design,
@ -95,9 +92,7 @@ import java.util.regex.Pattern;
* @author Khalid Shakir
*/
public class GATKReportTable {
/**
* REGEX that matches any table with an invalid name
*/
/** REGEX that matches any table with an invalid name */
public final static String INVALID_TABLE_NAME_REGEX = "[^a-zA-Z0-9_\\-\\.]";
private static final GATKReportVersion LATEST_REPORT_VERSION = GATKReportVersion.V0_2;
private String tableName;
@ -200,7 +195,6 @@ public class GATKReportTable {
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* @return The first primary key matching the column values or throws an exception.
*/
@ -214,7 +208,6 @@ public class GATKReportTable {
/**
* Returns true if there is at least one row with the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* @return true if there is at least one row matching the columns.
*/
@ -225,7 +218,6 @@ public class GATKReportTable {
/**
* Returns the first primary key matching the dotted column values.
* Ex: dbsnp.eval.called.all.novel.all
*
* @param dottedColumnValues Period concatenated values.
* @return The first primary key matching the column values or null.
*/
@ -236,7 +228,6 @@ public class GATKReportTable {
/**
* Returns the first primary key matching the column values.
* Ex: new String[] { "dbsnp", "eval", "called", "all", "novel", "all" }
*
* @param columnValues column values.
* @return The first primary key matching the column values.
*/
@ -244,7 +235,7 @@ public class GATKReportTable {
for (Object primaryKey : primaryKeyColumn) {
boolean matching = true;
for (int i = 0; matching && i < columnValues.length; i++) {
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i + 1));
matching = ObjectUtils.equals(columnValues[i], get(primaryKey, i+1));
}
if (matching)
return primaryKey;
@ -265,7 +256,6 @@ public class GATKReportTable {
/**
 * Add a column to the report, specifying its default value and its format.
 * Delegates to the four-argument overload, passing {@code true} as the third argument.
 *
 * @param columnName   the name of the column
 * @param defaultValue the default value of the column
 * @param format       the format passed through to the four-argument overload
 */
public void addColumn(String columnName, Object defaultValue, String format) {
    // NOTE(review): the third argument is presumably the "display" flag -- confirm against the overload.
    final boolean display = true;
    addColumn(columnName, defaultValue, display, format);
}
/**
* Add a column to the report, specify the default column value, and specify whether the column should be displayed in the final output (useful when intermediate columns are necessary for later calculations, but are not required to be in the output file).
*
@ -595,11 +585,10 @@ public class GATKReportTable {
/**
* Return the print width of the primary key column
*
* @return the width of the primary key column
*/
public int getPrimaryKeyColumnWidth() {
int maxWidth = getPrimaryKeyName().length();
int maxWidth = primaryKeyName.length();
for (Object primaryKey : primaryKeyColumn) {
int width = primaryKey.toString().length();
@ -631,15 +620,13 @@ public class GATKReportTable {
// Emit the table header, taking into account the padding requirement if the primary key is a hidden column
boolean needsPadding = false;
if (primaryKeyDisplay) {
out.printf(primaryKeyFormat, getPrimaryKeyName());
out.printf(primaryKeyFormat, primaryKeyName);
needsPadding = true;
}
for (String columnName : columns.keySet()) {
if (columns.get(columnName).isDisplayable()) {
if (needsPadding) {
out.printf(" ");
}
if (needsPadding) { out.printf(" "); }
out.printf(columnFormats.get(columnName).getNameFormat(), columnName);
needsPadding = true;
@ -658,9 +645,7 @@ public class GATKReportTable {
for (String columnName : columns.keySet()) {
if (columns.get(columnName).isDisplayable()) {
if (needsPadding) {
out.printf(" ");
}
if (needsPadding) { out.printf(" "); }
String value = columns.get(columnName).getStringValue(primaryKey);
out.printf(columnFormats.get(columnName).getValueFormat(), value);
@ -690,49 +675,4 @@ public class GATKReportTable {
/**
 * Returns the columns object held by this table.
 *
 * @return this table's {@code columns} field
 */
public GATKReportColumns getColumns() {
    return this.columns;
}
/**
 * Merges the rows of {@code input} into this table.
 *
 * This is kept separate from {@link #addRowsFrom(GATKReportTable)} because other ways of
 * combining rows (sum, average, etc.) are meant to be added here.
 * TODO: Add other combining algorithms
 *
 * @param input the table whose rows should be merged into this one
 * @throws ReviewedStingException if the column names or the primary key name differ between the tables
 */
public void mergeRows(GATKReportTable input) {
    // Both the column names AND the primary key name have to agree before rows are copied.
    final boolean sameColumns = input.getColumns().keySet().equals(this.getColumns().keySet());
    if (!sameColumns || !input.getPrimaryKeyName().equals(this.getPrimaryKeyName())) {
        throw new ReviewedStingException("Failed to combine GATKReportTable, columns don't match!");
    }
    this.addRowsFrom(input);
}
/**
 * Copies every value of every column in {@code input} into this table.
 *
 * A value whose row key already exists in this table overwrites the existing value.
 * TODO we should be able to handle combining data by adding, averaging, etc.
 *
 * @param input the table to copy rows from
 * @throws ReviewedStingException if {@code input} has a column that this table does not have
 */
public void addRowsFrom(GATKReportTable input) {
    for (String columnKey : input.getColumns().keySet()) {
        // Fail with a clear message rather than a NullPointerException on an unknown column.
        if (this.getColumns().get(columnKey) == null) {
            throw new ReviewedStingException("Failed to add rows to GATKReportTable, column not found: " + columnKey);
        }

        // The column maps row key -> value; copy each entry, replacing any value already present.
        final GATKReportColumn toAdd = input.getColumns().get(columnKey);
        for (Object rowKey : toAdd.keySet()) {
            this.set(rowKey, columnKey, toAdd.get(rowKey));
        }
    }
}
/**
 * Returns the name of this table's primary key column.
 *
 * @return this table's {@code primaryKeyName} field
 */
public String getPrimaryKeyName() {
    return this.primaryKeyName;
}
}

View File

@ -49,23 +49,23 @@ public class GATKReportUnitTest extends BaseTest {
@DataProvider(name = "rightAlignValues")
public Object[][] getRightAlignValues() {
return new Object[][]{
new Object[]{null, true},
new Object[]{"null", true},
new Object[]{"NA", true},
new Object[]{"0", true},
new Object[]{"0.0", true},
new Object[]{"-0", true},
new Object[]{"-0.0", true},
new Object[]{String.valueOf(Long.MAX_VALUE), true},
new Object[]{String.valueOf(Long.MIN_VALUE), true},
new Object[]{String.valueOf(Float.MIN_NORMAL), true},
new Object[]{String.valueOf(Double.MAX_VALUE), true},
new Object[]{String.valueOf(Double.MIN_VALUE), true},
new Object[]{String.valueOf(Double.POSITIVE_INFINITY), true},
new Object[]{String.valueOf(Double.NEGATIVE_INFINITY), true},
new Object[]{String.valueOf(Double.NaN), true},
new Object[]{"hello", false}
return new Object[][] {
new Object[] {null, true},
new Object[] {"null", true},
new Object[] {"NA", true},
new Object[] {"0", true},
new Object[] {"0.0", true},
new Object[] {"-0", true},
new Object[] {"-0.0", true},
new Object[] {String.valueOf(Long.MAX_VALUE), true},
new Object[] {String.valueOf(Long.MIN_VALUE), true},
new Object[] {String.valueOf(Float.MIN_NORMAL), true},
new Object[] {String.valueOf(Double.MAX_VALUE), true},
new Object[] {String.valueOf(Double.MIN_VALUE), true},
new Object[] {String.valueOf(Double.POSITIVE_INFINITY), true},
new Object[] {String.valueOf(Double.NEGATIVE_INFINITY), true},
new Object[] {String.valueOf(Double.NaN), true},
new Object[] {"hello", false}
};
}
@ -73,96 +73,4 @@ public class GATKReportUnitTest extends BaseTest {
/**
 * Checks that {@code GATKReportColumn.isRightAlign} returns the expected result for one value.
 *
 * @param value    the cell value passed to {@code isRightAlign}
 * @param expected the result {@code isRightAlign} should return for that value
 */
public void testIsRightAlign(String value, boolean expected) {
    final boolean actual = GATKReportColumn.isRightAlign(value);
    final String failureMessage = "right align of '" + value + "'";
    Assert.assertEquals(actual, expected, failureMessage);
}
/**
 * Combines three single-row reports and checks that the merged table renders exactly like
 * a table that was built with all three rows directly.
 */
@Test
public void testGATKReportGatherer() {
    GATKReportTable expected = new GATKReportTable("TableName", "Description");
    expected.addPrimaryKey("key");
    expected.addColumn("colA", 0);
    expected.addColumn("colB", 0);
    expected.set("row1", "colA", 1);
    expected.set("row1", "colB", 2);
    expected.set("row2", "colA", 3);
    expected.set("row2", "colB", 4);
    expected.set("row3", "colA", 5);
    expected.set("row3", "colB", 6);

    GATKReport report1 = makeSingleRowReport("row1", 1, 2);
    GATKReport report2 = makeSingleRowReport("row2", 3, 4);
    GATKReport report3 = makeSingleRowReport("row3", 5, 6);

    report1.combineWith(report2);
    report1.combineWith(report3);

    // Compare the rendered output so that rows, columns and values are all covered by one assertion.
    Assert.assertEquals(renderTable(report1.getTable("TableName")), renderTable(expected),
            "merged table differs from the expected table");
}

/** Builds a report holding one table ("TableName") with columns colA/colB and a single row. */
private static GATKReport makeSingleRowReport(String rowKey, int colA, int colB) {
    GATKReport report = new GATKReport();
    report.addTable("TableName", "Description");
    GATKReportTable table = report.getTable("TableName");
    table.addPrimaryKey("key");
    table.addColumn("colA", 0);
    table.addColumn("colB", 0);
    table.set(rowKey, "colA", colA);
    table.set(rowKey, "colB", colB);
    return report;
}

/** Writes the table to an in-memory stream and returns the text it produced. */
private static String renderTable(GATKReportTable table) {
    java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
    java.io.PrintStream stream = new java.io.PrintStream(buffer);
    table.write(stream);
    // Closing flushes any text the PrintStream still holds into the buffer.
    stream.close();
    return buffer.toString();
}
}