diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java index 44e3d5c2e..a4ac03b19 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java @@ -28,7 +28,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu public String toSimpleString() { return toString(); } public String repl() { return this.toString(); } - public String delimiter() { + public String delimiterRegex() { return "\t"; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 3602c1834..c99e3549c 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -133,7 +133,7 @@ public class ReferenceOrderedData implements this.type = type; this.name = name; this.header = initializeROD(name, file, type); - this.fieldDelimiter = newROD(name, type).delimiter(); + this.fieldDelimiter = newROD(name, type).delimiterRegex(); } public String getName() { return name; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java index 8061a7a27..c2fa95090 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java @@ -24,7 +24,7 @@ public interface ReferenceOrderedDatum extends Comparable * Used by the ROD system to determine how to split input lines * @return Regex string delimiter separating fields */ - public String delimiter(); + public String delimiterRegex(); public GenomeLoc getLocation(); public int compareTo( 
ReferenceOrderedDatum that ); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java index 4b49c071e..cb247d795 100755 --- a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -20,7 +20,23 @@ import org.apache.log4j.Logger; * User: mdepristo * Date: Feb 27, 2009 * Time: 10:47:14 AM - * To change this template use File | Settings | File Templates. + * + * System for interacting with tabular formatted data of the following format: + * + * # comment line + * # must include HEADER KEYWORD + * HEADER COL1 ... COLN + * chr:pos data1 ... dataN + * + * The system supports the rod interface. You can just access tabularRODs through the normal ROD system. + * + * You can also write your own files, as such: + * + * ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); + * assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); + * String rowData = String.format("%d %d %d", 1, 2, 3); + * TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" ")); + * assertTrue(row.toString().equals("chrM:1\t1\t2\t3")); */ public class TabularROD extends BasicReferenceOrderedDatum implements Map { private static Logger logger = Logger.getLogger(TabularROD.class); @@ -29,6 +45,59 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map attributes; private ArrayList header; + public static String DEFAULT_DELIMITER = "\t"; + public static String DEFAULT_DELIMITER_REGEX = "\\s+"; + + public static String DELIMITER = DEFAULT_DELIMITER; + public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX; + + private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000; + private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*"); + private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + + /** + * Set the 
global tabular ROD delimiter and the regex to split the delimiter. + * + * The delimiter to put between fields, while the regex is used to split lines + * + * @param delimiter + * @param delimeterRegex + */ + public static void setDelimiter(final String delimiter, final String delimeterRegex) { + DELIMITER = delimiter; + DELIMITER_REGEX = delimeterRegex; + } + + /** + * Returns a parsable header line: the header columns joined with the current DELIMITER. + * @param header column names; must be non-empty and begin with the HEADER keyword, otherwise a RuntimeException is thrown + */ + public static String headerString(final ArrayList header) { + requireGoodHeader(header); + return Utils.join(DELIMITER, header); + } + + /** + * Returns a comment line containing the *single line* string msg + * + * @param msg the comment text; expected to be a single line (not checked here) + * @return msg prefixed with "# " + */ + public static String commentString(final String msg) { + return "# " + msg; + } + + private static boolean headerIsGood(final ArrayList header) { + if ( header.size() == 0 ) return false; /* an empty header cannot contain the HEADER keyword */ + if ( ! header.get(0).equals("HEADER") ) return false; /* first column must be the literal HEADER keyword */ + return true; + } + + private static void requireGoodHeader(final ArrayList header) { + if ( ! headerIsGood(header) ) + throw new RuntimeException("Header must begin with HEADER keyword"); /* also raised for an empty header */ + } + // ---------------------------------------------------------------------- // // Constructors @@ -39,10 +108,41 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map(); } - public TabularROD(final String name, ArrayList header) { - super(name); - attributes = new HashMap(); + /** + * Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data + * must be bound to each corresponding header[i] field before the object is really usable. + * + * @param name name handed to the single-argument constructor + * @param header header columns; must begin with the HEADER keyword, otherwise a RuntimeException is thrown + * @param loc location stored for this datum + */ + public TabularROD(final String name, ArrayList header, GenomeLoc loc) { + this(name); this.header = header; + this.loc = loc; + requireGoodHeader(this.header); + } + + /** + * Make a new TabularROD with name, using header columns header, at loc, with data associated with the + * header columns. 
data and header are assumed to be in the same order, and bindings will be established + * from header[i] = data[i]. The TabularROD at this stage can be printed, manipulated, it is considered + * a full fledged, initialized object. + * + * @param name + * @param header + * @param loc + * @param data + */ + public TabularROD(final String name, ArrayList header, GenomeLoc loc, String[] data) { + this(name, header, loc); + + if ( header.size() != data.length + 1 ) + throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s", + header.size(), data.length, Utils.join("\t", data))); + for ( int i = 0; i < data.length; i++ ) { + put(header.get(i+1), data[i]); + } } /** @@ -61,7 +161,8 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map(Arrays.asList(line.split("\\s+"))); + header = new ArrayList(Arrays.asList(line.split(DELIMITER_REGEX))); + //System.out.printf("HEADER IS %s%n", Utils.join(":", header)); } if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER ) @@ -75,23 +176,22 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map getHeader() { + return header; + } + public String get(final Object key) { return attributes.get(key); } @@ -113,7 +213,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map)(headerObj); diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java index 053df9cae..03211226d 100755 --- a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -12,8 +12,12 @@ import org.broadinstitute.sting.utils.RefHanger; import org.broadinstitute.sting.utils.GenomeLoc; import java.io.File; +import java.io.PrintStream; +import java.io.FileOutputStream; +import java.io.FileNotFoundException; import java.util.Arrays; import java.util.List; 
+import java.util.ArrayList; /** * Basic unit test for TabularROD @@ -23,8 +27,7 @@ public class TabularRODTest extends BaseTest { private static FastaSequenceFile2 seq; private ReferenceOrderedData ROD; private ReferenceOrderedData.RODIterator iter; - private ReferenceOrderedData ROD2; - private ReferenceOrderedData.RODIterator iter2; + @BeforeClass public static void init() { @@ -35,13 +38,11 @@ public class TabularRODTest extends BaseTest { @Before public void setupTabularROD() { + TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX); File file = new File(testDir + "TabularDataTest.dat"); ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class); iter = ROD.iterator(); - - File file2 = new File(testDir + "TabularDataTest2.dat"); - ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class); - iter2 = ROD2.iterator(); + } @Test @@ -107,10 +108,15 @@ public class TabularRODTest extends BaseTest { assertTrue(one.toString().equals("chrM:10\tA\tB\tC")); } - @Test - public void test2p1() { - logger.warn("Executing test2p1"); - TabularROD one2 = (TabularROD)iter2.next(); + // Didn't change the delimiter + @Test (expected = RuntimeException.class) + public void testDelim1() { + File file2 = new File(testDir + "TabularDataTest2.dat"); + ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); + ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + + logger.warn("Executing testDelim1"); + TabularROD one2 = (TabularROD)iter_commas.next(); assertTrue(one2.size() == 4); assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10))); assertTrue(one2.get("COL1").equals("A")); @@ -118,4 +124,92 @@ public class TabularRODTest extends BaseTest { assertTrue(one2.get("COL3").equals("C")); assertTrue(one2.get("COL4").equals("1")); } + + @Test + public void testDelim2() { + TabularROD.setDelimiter(",",","); + File file2 = new File(testDir + "TabularDataTest2.dat"); + 
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class); + ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + + logger.warn("Executing testDelim1"); + TabularROD one2 = (TabularROD)iter_commas.next(); + assertTrue(one2.size() == 4); + assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10))); + assertTrue(one2.get("COL1").equals("A")); + assertTrue(one2.get("COL2").equals("B")); + assertTrue(one2.get("COL3").equals("C")); + assertTrue(one2.get("COL4").equals("1")); + } + + @Test + public void testCreation() { + logger.warn("Executing testCreation"); + ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); + assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); + String rowData = String.format("%d %d %d", 1, 2, 3); + TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" ")); + assertTrue(row.toString().equals("chrM:1\t1\t2\t3")); + } + + @Test + public void testCreationAndWriting() throws FileNotFoundException { + logger.warn("Executing testCreationAndWriting"); + + File outputFile = new File(testDir + "testTabularRodOutputTemp.dat"); + PrintStream out = new PrintStream(new FileOutputStream(outputFile)); + + ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); + out.println(TabularROD.commentString("Hello, created from test")); + out.println(TabularROD.commentString("")); + out.println(TabularROD.headerString(header)); + + String rowData = String.format("%d %d %d", 1, 2, 3); + TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" ")); + out.println(row.toString()); + + rowData = String.format("%d %d %d", 3, 4, 5); + row = new TabularROD("myName", header, new GenomeLoc("chrM", 2), rowData.split(" ")); + out.println(row.toString()); + + ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class); + 
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator(); + + TabularROD one = (TabularROD)iter_commas.next(); + assertTrue(one.size() == 3); + assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 1))); + assertTrue(one.get("col1").equals("1")); + assertTrue(one.get("col2").equals("2")); + assertTrue(one.get("col3").equals("3")); + + TabularROD two = (TabularROD)iter_commas.next(); + assertTrue(two.size() == 3); + assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 2))); + assertTrue(two.get("col1").equals("3")); + assertTrue(two.get("col2").equals("4")); + assertTrue(two.get("col3").equals("5")); + } + + @Test (expected=RuntimeException.class ) + public void testBadHeader1() { + logger.warn("Executing testBadHeader1"); + ArrayList header = new ArrayList(); + TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1)); + } + + @Test (expected=RuntimeException.class ) + public void testBadHeader2() { + logger.warn("Executing testBadHeader2"); + ArrayList header = new ArrayList(Arrays.asList("col1", "col2", "col3")); + TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1)); + } + + @Test (expected=RuntimeException.class ) + public void testBadData1() { + logger.warn("Executing testBadData1"); + ArrayList header = new ArrayList(Arrays.asList("HEADER", "col1", "col2", "col3")); + assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3")); + String rowData = String.format("%d %d %d %d", 1, 2, 3, 4); + TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" ")); + } } \ No newline at end of file diff --git a/testdata/TabularDataTest2.dat b/testdata/TabularDataTest2.dat index 8d8768b53..ac548ccf8 100755 --- a/testdata/TabularDataTest2.dat +++ b/testdata/TabularDataTest2.dat @@ -1,7 +1,7 @@ # comment # comment line # -HEADER COL1 COL2 COL3 COL4 -chrM:10 A B C 1 -chrM:20 C D E 2 -chrM:30 F G H 3 \ No newline at end of file +HEADER,COL1,COL2,COL3,COL4 
+chrM:10,A,B,C,1 +chrM:20,C,D,E,2 +chrM:30,F,G,H,3 \ No newline at end of file