Better interface to the tabular ROD that makes writing files easier. Also adds corresponding test files.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@719 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-05-14 23:20:11 +00:00
parent 50f32b7f61
commit 7834b969b4
6 changed files with 239 additions and 31 deletions

View File

@ -28,7 +28,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu
/**
 * Simple string form of this datum. This base implementation returns the same text as toString().
 *
 * @return the result of toString()
 */
public String toSimpleString() {
    return this.toString();
}
/**
 * This base implementation returns the same text as toString().
 *
 * @return the result of toString()
 */
public String repl() {
    return toString();
}
public String delimiter() {
public String delimiterRegex() {
return "\t";
}

View File

@ -133,7 +133,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
this.type = type;
this.name = name;
this.header = initializeROD(name, file, type);
this.fieldDelimiter = newROD(name, type).delimiter();
this.fieldDelimiter = newROD(name, type).delimiterRegex();
}
/**
 * @return the name stored in this object's name field
 */
public String getName() {
    return this.name;
}

View File

@ -24,7 +24,7 @@ public interface ReferenceOrderedDatum extends Comparable<ReferenceOrderedDatum>
* Used by the ROD system to determine how to split input lines
* @return Regex string delimiter separating fields
*/
public String delimiter();
public String delimiterRegex();
public GenomeLoc getLocation();
public int compareTo( ReferenceOrderedDatum that );

View File

@ -20,7 +20,23 @@ import org.apache.log4j.Logger;
* User: mdepristo
* Date: Feb 27, 2009
* Time: 10:47:14 AM
* To change this template use File | Settings | File Templates.
*
* System for interacting with tabular formatted data of the following format:
*
* # comment line
* # must include HEADER KEYWORD
* HEADER COL1 ... COLN
* chr:pos data1 ... dataN
*
* The system supports the rod interface. You can just access tabularRODs through the normal ROD system.
*
* You can also write your own files, as such:
*
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
* String rowData = String.format("%d %d %d", 1, 2, 3);
* TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
*/
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
private static Logger logger = Logger.getLogger(TabularROD.class);
@ -29,6 +45,59 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
private HashMap<String, String> attributes;
private ArrayList<String> header;
/** Default string placed between fields when a header is rendered by headerString(). */
public static final String DEFAULT_DELIMITER = "\t";
/** Default regular expression used to split an input line into its fields. */
public static final String DEFAULT_DELIMITER_REGEX = "\\s+";
/** Current global output delimiter; change it with setDelimiter(), not by direct assignment. */
public static String DELIMITER = DEFAULT_DELIMITER;
/** Current global field-splitting regex; change it with setDelimiter(), not by direct assignment. */
public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;
/** Upper bound on the number of lines examined while searching for the header line. */
private static final int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
/** Matches a line whose first non-whitespace token starts with the HEADER keyword. */
private static final Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
/** Matches a line that begins with '#'. */
private static final Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
/**
 * Replaces the global tabular ROD delimiter settings.
 *
 * Two values are needed because writing and reading differ: the delimiter is the literal text
 * placed between fields on output, while the regex is what input lines are split on.
 *
 * @param delimiter       literal string to put between fields when writing
 * @param delimeterRegex  regular expression used to split a line into fields when reading
 */
public static void setDelimiter(final String delimiter, final String delimeterRegex) {
    DELIMITER_REGEX = delimeterRegex;
    DELIMITER = delimiter;
}
/**
 * Renders the header columns as a single header line, joined with the current DELIMITER.
 *
 * @param header column names; must begin with the HEADER keyword
 * @return the header elements joined by DELIMITER
 * @throws RuntimeException if header does not begin with the HEADER keyword
 */
public static String headerString(final ArrayList<String> header) {
    // Reject malformed headers before producing any output
    requireGoodHeader(header);
    final String headerLine = Utils.join(DELIMITER, header);
    return headerLine;
}
/**
 * Returns msg formatted as comment text, i.e. prefixed with "# ".
 *
 * A single-line msg yields exactly "# " + msg. If msg contains line breaks, every
 * line after a break is prefixed with "# " as well, so no part of the message can end up as an
 * uncommented line in the output file.
 *
 * @param msg the comment text; null is rendered as the text "null"
 * @return msg with "# " at the start and after every line break
 */
public static String commentString(final String msg) {
    // String.valueOf keeps the historical "# null" result for a null msg
    final String text = String.valueOf(msg);
    // Re-emit each line terminator followed by a fresh comment prefix
    return "# " + text.replaceAll("(\\r\\n|\\r|\\n)", "$1# ");
}
/**
 * Tells whether header is usable: it must be non-null, non-empty, and have "HEADER" as its
 * first element.
 *
 * @param header candidate header columns; may be null
 * @return true if the first element is the HEADER keyword, false otherwise
 */
private static boolean headerIsGood(final ArrayList<String> header) {
    // A missing or empty header cannot start with the keyword
    if ( header == null || header.isEmpty() ) return false;
    // Constant-first comparison so a null first element is rejected rather than throwing
    return "HEADER".equals(header.get(0));
}
/**
 * Guard that aborts with an exception unless headerIsGood() accepts the header.
 *
 * @param header header columns to validate
 * @throws RuntimeException if headerIsGood(header) is false
 */
private static void requireGoodHeader(final ArrayList<String> header) {
    final boolean acceptable = headerIsGood(header);
    if ( acceptable )
        return;
    throw new RuntimeException("Header must begin with HEADER keyword");
}
// ----------------------------------------------------------------------
//
// Constructors
@ -39,10 +108,41 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
attributes = new HashMap<String, String>();
}
public TabularROD(final String name, ArrayList<String> header) {
super(name);
attributes = new HashMap<String, String>();
/**
 * Creates a TabularROD called name, positioned at loc, with the given header columns and no
 * data bound yet. Values must still be bound to the header fields before the object is
 * really usable.
 *
 * @param name   name of this ROD
 * @param header header columns; must begin with the HEADER keyword
 * @param loc    location of this datum
 * @throws RuntimeException if header does not begin with the HEADER keyword
 */
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc) {
    this(name);
    // Validate first; a rejected header aborts construction either way
    requireGoodHeader(header);
    this.loc = loc;
    this.header = header;
}
/**
* Make a new TabularROD with name, using header columns header, at loc, with data associated with the
* header columns. data and header are assumed to be in the same order, and bindings will be established
* from header[i] = data[i]. The TabularROD at this stage can be printed, manipulated, it is considered
* a full fledged, initialized object.
*
* @param name
* @param header
* @param loc
* @param data
*/
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc, String[] data) {
this(name, header, loc);
if ( header.size() != data.length + 1 )
throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s",
header.size(), data.length, Utils.join("\t", data)));
for ( int i = 0; i < data.length; i++ ) {
put(header.get(i+1), data[i]);
}
}
/**
@ -61,7 +161,8 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
Matcher m = HEADER_PATTERN.matcher(line);
if ( m.matches() ) {
//System.out.printf("Found a header line: %s%n", line);
header = new ArrayList<String>(Arrays.asList(line.split("\\s+")));
header = new ArrayList<String>(Arrays.asList(line.split(DELIMITER_REGEX)));
//System.out.printf("HEADER IS %s%n", Utils.join(":", header));
}
if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER )
@ -75,23 +176,22 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
throw new RuntimeException("Couldn't find header line in TabularROD!");
}
//System.exit(1);
return header;
}
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
// ----------------------------------------------------------------------
//
// Accessors
// ROD accessors
//
// ----------------------------------------------------------------------
/**
 * @return the location stored in this object's loc field
 */
public GenomeLoc getLocation() {
    return this.loc;
}
/**
 * @return the header list held by this object (the same list instance, not a copy)
 */
public ArrayList<String> getHeader() {
    return this.header;
}
/**
 * Looks up the value bound to key in the attributes map.
 *
 * @param key the key to look up
 * @return whatever the attributes map returns for key
 */
public String get(final Object key) {
    final String value = attributes.get(key);
    return value;
}
@ -113,7 +213,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
for ( String key : header ) {
if ( containsKey(key) ) { // avoid the header
strings.add(this.get(key));
System.out.printf("Adding %s%n", this.get(key));
//System.out.printf("Adding %s%n", this.get(key));
}
}
return Utils.join("\t", strings);
@ -149,10 +249,24 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
return String.format("%s\t%s", getLocation(), getAttributeString());
}
public String delimiter() {
return "\\s+";
/**
 * Regular expression that should be used to split data rows and the header into fields.
 *
 * @return the current global DELIMITER_REGEX
 */
public String delimiterRegex() {
    final String regex = DELIMITER_REGEX;
    return regex;
}
/**
* Used by ROD management system to set the data in this ROD associated with a line in a rod
*
* @param headerObj
* @param parts
* @return
* @throws IOException
*/
public boolean parseLine(final Object headerObj, final String[] parts) throws IOException {
header = (ArrayList<String>)(headerObj);

View File

@ -12,8 +12,12 @@ import org.broadinstitute.sting.utils.RefHanger;
import org.broadinstitute.sting.utils.GenomeLoc;
import java.io.File;
import java.io.PrintStream;
import java.io.FileOutputStream;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.List;
import java.util.ArrayList;
/**
* Basic unit test for TabularROD
@ -23,8 +27,7 @@ public class TabularRODTest extends BaseTest {
private static FastaSequenceFile2 seq;
private ReferenceOrderedData ROD;
private ReferenceOrderedData.RODIterator iter;
private ReferenceOrderedData ROD2;
private ReferenceOrderedData.RODIterator iter2;
@BeforeClass
public static void init() {
@ -35,13 +38,11 @@ public class TabularRODTest extends BaseTest {
@Before
public void setupTabularROD() {
TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX);
File file = new File(testDir + "TabularDataTest.dat");
ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class);
iter = ROD.iterator();
File file2 = new File(testDir + "TabularDataTest2.dat");
ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
iter2 = ROD2.iterator();
}
@Test
@ -107,10 +108,15 @@ public class TabularRODTest extends BaseTest {
assertTrue(one.toString().equals("chrM:10\tA\tB\tC"));
}
@Test
public void test2p1() {
logger.warn("Executing test2p1");
TabularROD one2 = (TabularROD)iter2.next();
// Didn't change the delimiter
@Test (expected = RuntimeException.class)
public void testDelim1() {
File file2 = new File(testDir + "TabularDataTest2.dat");
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
logger.warn("Executing testDelim1");
TabularROD one2 = (TabularROD)iter_commas.next();
assertTrue(one2.size() == 4);
assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10)));
assertTrue(one2.get("COL1").equals("A"));
@ -118,4 +124,92 @@ public class TabularRODTest extends BaseTest {
assertTrue(one2.get("COL3").equals("C"));
assertTrue(one2.get("COL4").equals("1"));
}
/**
 * With both the delimiter and its regex set to ",", the first row of the comma-separated test
 * file must parse into its four columns.
 */
@Test
public void testDelim2() {
    TabularROD.setDelimiter(",",",");
    File file2 = new File(testDir + "TabularDataTest2.dat");
    ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
    ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();

    // The message previously named testDelim1 (copy-paste slip), which made logs misleading
    logger.warn("Executing testDelim2");
    TabularROD one2 = (TabularROD)iter_commas.next();
    assertTrue(one2.size() == 4);
    assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10)));
    assertTrue(one2.get("COL1").equals("A"));
    assertTrue(one2.get("COL2").equals("B"));
    assertTrue(one2.get("COL3").equals("C"));
    assertTrue(one2.get("COL4").equals("1"));
}
/**
 * Builds a row in memory and checks both the rendered header line and the rendered row.
 */
@Test
public void testCreation() {
    logger.warn("Executing testCreation");

    // Column names; the leading HEADER keyword is required by TabularROD
    final ArrayList<String> columns = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
    final String renderedHeader = TabularROD.headerString(columns);
    assertTrue(renderedHeader.equals("HEADER\tcol1\tcol2\tcol3"));

    // One value per data column
    final String[] values = String.format("%d %d %d", 1, 2, 3).split(" ");
    final TabularROD row = new TabularROD("myName", columns, new GenomeLoc("chrM", 1), values);
    final String renderedRow = row.toString();
    assertTrue(renderedRow.equals("chrM:1\t1\t2\t3"));
}
/**
 * Round trip: writes two comment lines, a header and two rows to a file using the TabularROD
 * string helpers, then reads the file back through ReferenceOrderedData and checks both rows.
 */
@Test
public void testCreationAndWriting() throws FileNotFoundException {
    logger.warn("Executing testCreationAndWriting");

    File outputFile = new File(testDir + "testTabularRodOutputTemp.dat");
    ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));

    PrintStream out = new PrintStream(new FileOutputStream(outputFile));
    try {
        out.println(TabularROD.commentString("Hello, created from test"));
        out.println(TabularROD.commentString(""));
        out.println(TabularROD.headerString(header));

        String rowData = String.format("%d %d %d", 1, 2, 3);
        TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
        out.println(row.toString());

        rowData = String.format("%d %d %d", 3, 4, 5);
        row = new TabularROD("myName", header, new GenomeLoc("chrM", 2), rowData.split(" "));
        out.println(row.toString());
    } finally {
        // Release the file handle (previously leaked) before the file is read back,
        // including when building a row throws
        out.close();
    }

    ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
    ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();

    TabularROD one = (TabularROD)iter_commas.next();
    assertTrue(one.size() == 3);
    assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 1)));
    assertTrue(one.get("col1").equals("1"));
    assertTrue(one.get("col2").equals("2"));
    assertTrue(one.get("col3").equals("3"));

    TabularROD two = (TabularROD)iter_commas.next();
    assertTrue(two.size() == 3);
    assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 2)));
    assertTrue(two.get("col1").equals("3"));
    assertTrue(two.get("col2").equals("4"));
    assertTrue(two.get("col3").equals("5"));
}
/**
 * An empty header must be rejected when constructing a TabularROD.
 */
@Test (expected=RuntimeException.class )
public void testBadHeader1() {
    logger.warn("Executing testBadHeader1");
    final ArrayList<String> emptyHeader = new ArrayList<String>();
    final GenomeLoc position = new GenomeLoc("chrM", 1);
    // Construction itself is expected to throw
    new TabularROD("myName", emptyHeader, position);
}
/**
 * A header that does not start with the HEADER keyword must be rejected.
 */
@Test (expected=RuntimeException.class )
public void testBadHeader2() {
    logger.warn("Executing testBadHeader2");
    final ArrayList<String> noKeyword = new ArrayList<String>(Arrays.asList("col1", "col2", "col3"));
    final GenomeLoc position = new GenomeLoc("chrM", 1);
    // Construction itself is expected to throw
    new TabularROD("myName", noKeyword, position);
}
/**
 * Supplying four data values for a header with three data columns must be rejected.
 */
@Test (expected=RuntimeException.class )
public void testBadData1() {
    logger.warn("Executing testBadData1");

    final ArrayList<String> columns = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
    final String renderedHeader = TabularROD.headerString(columns);
    assertTrue(renderedHeader.equals("HEADER\tcol1\tcol2\tcol3"));

    // One value too many for the three data columns
    final String[] values = String.format("%d %d %d %d", 1, 2, 3, 4).split(" ");
    new TabularROD("myName", columns, new GenomeLoc("chrM", 1), values);
}
}

View File

@ -1,7 +1,7 @@
# comment
# comment line
#
HEADER COL1 COL2 COL3 COL4
chrM:10 A B C 1
chrM:20 C D E 2
chrM:30 F G H 3
HEADER,COL1,COL2,COL3,COL4
chrM:10,A,B,C,1
chrM:20,C,D,E,2
chrM:30,F,G,H,3