Better interface to the tabular ROD that makes writing files easier. Also adds corresponding test files.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@719 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
50f32b7f61
commit
7834b969b4
|
|
@ -28,7 +28,7 @@ public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatu
|
|||
public String toSimpleString() { return toString(); }
|
||||
public String repl() { return this.toString(); }
|
||||
|
||||
public String delimiter() {
|
||||
public String delimiterRegex() {
|
||||
return "\t";
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -133,7 +133,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
this.type = type;
|
||||
this.name = name;
|
||||
this.header = initializeROD(name, file, type);
|
||||
this.fieldDelimiter = newROD(name, type).delimiter();
|
||||
this.fieldDelimiter = newROD(name, type).delimiterRegex();
|
||||
}
|
||||
|
||||
public String getName() { return name; }
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ public interface ReferenceOrderedDatum extends Comparable<ReferenceOrderedDatum>
|
|||
* Used by the ROD system to determine how to split input lines
|
||||
* @return Regex string delimiter separating fields
|
||||
*/
|
||||
public String delimiter();
|
||||
public String delimiterRegex();
|
||||
|
||||
public GenomeLoc getLocation();
|
||||
public int compareTo( ReferenceOrderedDatum that );
|
||||
|
|
|
|||
|
|
@ -20,7 +20,23 @@ import org.apache.log4j.Logger;
|
|||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*
|
||||
* System for interacting with tabular formatted data of the following format:
|
||||
*
|
||||
* # comment line
|
||||
* # must include HEADER KEYWORD
|
||||
* HEADER COL1 ... COLN
|
||||
* chr:pos data1 ... dataN
|
||||
*
|
||||
* The system supports the ROD interface, so TabularRODs can be accessed through the normal ROD system.
|
||||
*
|
||||
* You can also write your own files, as such:
|
||||
*
|
||||
* ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
* assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
* String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
* TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
|
||||
* assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
||||
*/
|
||||
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
|
||||
private static Logger logger = Logger.getLogger(TabularROD.class);
|
||||
|
|
@ -29,6 +45,59 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
private HashMap<String, String> attributes;
|
||||
private ArrayList<String> header;
|
||||
|
||||
public static String DEFAULT_DELIMITER = "\t";
|
||||
public static String DEFAULT_DELIMITER_REGEX = "\\s+";
|
||||
|
||||
public static String DELIMITER = DEFAULT_DELIMITER;
|
||||
public static String DELIMITER_REGEX = DEFAULT_DELIMITER_REGEX;
|
||||
|
||||
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
|
||||
/**
|
||||
* Set the global tabular ROD delimiter and the regex to split the delimiter.
|
||||
*
|
||||
* The delimiter is the string written between fields on output, while the regex is used to split input lines into fields.
|
||||
*
|
||||
* @param delimiter
|
||||
* @param delimeterRegex
|
||||
*/
|
||||
public static void setDelimiter(final String delimiter, final String delimeterRegex) {
|
||||
DELIMITER = delimiter;
|
||||
DELIMITER_REGEX = delimeterRegex;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a parsable string representation of the header: its fields joined by the current DELIMITER. The header must begin with the HEADER keyword.
|
||||
* @param header
|
||||
*/
|
||||
public static String headerString(final ArrayList<String> header) {
|
||||
requireGoodHeader(header);
|
||||
return Utils.join(DELIMITER, header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a comment line containing the *single line* string msg
|
||||
*
|
||||
* @param msg
|
||||
* @return
|
||||
*/
|
||||
public static String commentString(final String msg) {
|
||||
return "# " + msg;
|
||||
}
|
||||
|
||||
private static boolean headerIsGood(final ArrayList<String> header) {
|
||||
if ( header.size() == 0 ) return false;
|
||||
if ( ! header.get(0).equals("HEADER") ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
private static void requireGoodHeader(final ArrayList<String> header) {
|
||||
if ( ! headerIsGood(header) )
|
||||
throw new RuntimeException("Header must begin with HEADER keyword");
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
|
|
@ -39,10 +108,41 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
attributes = new HashMap<String, String>();
|
||||
}
|
||||
|
||||
public TabularROD(final String name, ArrayList<String> header) {
|
||||
super(name);
|
||||
attributes = new HashMap<String, String>();
|
||||
/**
|
||||
* Make a new TabularROD with name, using header columns header, at loc, without any bound data. Data
|
||||
* must be bound to each corresponding header[i] field before the object is really usable.
|
||||
*
|
||||
* @param name
|
||||
* @param header
|
||||
* @param loc
|
||||
*/
|
||||
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc) {
|
||||
this(name);
|
||||
this.header = header;
|
||||
this.loc = loc;
|
||||
requireGoodHeader(this.header);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a new TabularROD with name, using header columns header, at loc, with data associated with the
|
||||
* header columns. data and header are assumed to be in the same order, and bindings will be established
|
||||
* from header[i+1] = data[i] (header[0] is the HEADER keyword). The TabularROD at this stage can be printed and manipulated; it is considered
|
||||
* a full fledged, initialized object.
|
||||
*
|
||||
* @param name
|
||||
* @param header
|
||||
* @param loc
|
||||
* @param data
|
||||
*/
|
||||
public TabularROD(final String name, ArrayList<String> header, GenomeLoc loc, String[] data) {
|
||||
this(name, header, loc);
|
||||
|
||||
if ( header.size() != data.length + 1 )
|
||||
throw new RuntimeException(String.format("Incorrect tabular data format: header has %d columns but %d data elements were provided: %s",
|
||||
header.size(), data.length, Utils.join("\t", data)));
|
||||
for ( int i = 0; i < data.length; i++ ) {
|
||||
put(header.get(i+1), data[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -61,7 +161,8 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
Matcher m = HEADER_PATTERN.matcher(line);
|
||||
if ( m.matches() ) {
|
||||
//System.out.printf("Found a header line: %s%n", line);
|
||||
header = new ArrayList<String>(Arrays.asList(line.split("\\s+")));
|
||||
header = new ArrayList<String>(Arrays.asList(line.split(DELIMITER_REGEX)));
|
||||
//System.out.printf("HEADER IS %s%n", Utils.join(":", header));
|
||||
}
|
||||
|
||||
if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER )
|
||||
|
|
@ -75,23 +176,22 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
throw new RuntimeException("Couldn't find header line in TabularROD!");
|
||||
}
|
||||
|
||||
//System.exit(1);
|
||||
return header;
|
||||
}
|
||||
|
||||
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Accessors
|
||||
// ROD accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public GenomeLoc getLocation() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public ArrayList<String> getHeader() {
|
||||
return header;
|
||||
}
|
||||
|
||||
public String get(final Object key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
|
@ -113,7 +213,7 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
for ( String key : header ) {
|
||||
if ( containsKey(key) ) { // avoid the header
|
||||
strings.add(this.get(key));
|
||||
System.out.printf("Adding %s%n", this.get(key));
|
||||
//System.out.printf("Adding %s%n", this.get(key));
|
||||
}
|
||||
}
|
||||
return Utils.join("\t", strings);
|
||||
|
|
@ -149,10 +249,24 @@ public class TabularROD extends BasicReferenceOrderedDatum implements Map<String
|
|||
return String.format("%s\t%s", getLocation(), getAttributeString());
|
||||
}
|
||||
|
||||
public String delimiter() {
|
||||
return "\\s+";
|
||||
/**
|
||||
* The delimiter regular expression that should be used to separate fields in data rows
|
||||
* and header.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public String delimiterRegex() {
|
||||
return DELIMITER_REGEX;
|
||||
}
|
||||
|
||||
/**
|
||||
* Used by the ROD management system to set the data in this ROD from the fields of one line of a ROD file
|
||||
*
|
||||
* @param headerObj
|
||||
* @param parts
|
||||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
public boolean parseLine(final Object headerObj, final String[] parts) throws IOException {
|
||||
header = (ArrayList<String>)(headerObj);
|
||||
|
||||
|
|
|
|||
|
|
@ -12,8 +12,12 @@ import org.broadinstitute.sting.utils.RefHanger;
|
|||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.PrintStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* Basic unit test for TabularROD
|
||||
|
|
@ -23,8 +27,7 @@ public class TabularRODTest extends BaseTest {
|
|||
private static FastaSequenceFile2 seq;
|
||||
private ReferenceOrderedData ROD;
|
||||
private ReferenceOrderedData.RODIterator iter;
|
||||
private ReferenceOrderedData ROD2;
|
||||
private ReferenceOrderedData.RODIterator iter2;
|
||||
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
|
|
@ -35,13 +38,11 @@ public class TabularRODTest extends BaseTest {
|
|||
|
||||
@Before
|
||||
public void setupTabularROD() {
|
||||
TabularROD.setDelimiter(TabularROD.DEFAULT_DELIMITER, TabularROD.DEFAULT_DELIMITER_REGEX);
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class);
|
||||
iter = ROD.iterator();
|
||||
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
iter2 = ROD2.iterator();
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
|
|
@ -107,10 +108,15 @@ public class TabularRODTest extends BaseTest {
|
|||
assertTrue(one.toString().equals("chrM:10\tA\tB\tC"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2p1() {
|
||||
logger.warn("Executing test2p1");
|
||||
TabularROD one2 = (TabularROD)iter2.next();
|
||||
// Didn't change the delimiter
|
||||
@Test (expected = RuntimeException.class)
|
||||
public void testDelim1() {
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||
assertTrue(one2.size() == 4);
|
||||
assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10)));
|
||||
assertTrue(one2.get("COL1").equals("A"));
|
||||
|
|
@ -118,4 +124,92 @@ public class TabularRODTest extends BaseTest {
|
|||
assertTrue(one2.get("COL3").equals("C"));
|
||||
assertTrue(one2.get("COL4").equals("1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDelim2() {
|
||||
TabularROD.setDelimiter(",",",");
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
logger.warn("Executing testDelim1");
|
||||
TabularROD one2 = (TabularROD)iter_commas.next();
|
||||
assertTrue(one2.size() == 4);
|
||||
assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10)));
|
||||
assertTrue(one2.get("COL1").equals("A"));
|
||||
assertTrue(one2.get("COL2").equals("B"));
|
||||
assertTrue(one2.get("COL3").equals("C"));
|
||||
assertTrue(one2.get("COL4").equals("1"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreation() {
|
||||
logger.warn("Executing testCreation");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
|
||||
assertTrue(row.toString().equals("chrM:1\t1\t2\t3"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreationAndWriting() throws FileNotFoundException {
|
||||
logger.warn("Executing testCreationAndWriting");
|
||||
|
||||
File outputFile = new File(testDir + "testTabularRodOutputTemp.dat");
|
||||
PrintStream out = new PrintStream(new FileOutputStream(outputFile));
|
||||
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
out.println(TabularROD.commentString("Hello, created from test"));
|
||||
out.println(TabularROD.commentString(""));
|
||||
out.println(TabularROD.headerString(header));
|
||||
|
||||
String rowData = String.format("%d %d %d", 1, 2, 3);
|
||||
TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
|
||||
out.println(row.toString());
|
||||
|
||||
rowData = String.format("%d %d %d", 3, 4, 5);
|
||||
row = new TabularROD("myName", header, new GenomeLoc("chrM", 2), rowData.split(" "));
|
||||
out.println(row.toString());
|
||||
|
||||
ReferenceOrderedData ROD_commas = new ReferenceOrderedData("tableTest", outputFile, TabularROD.class);
|
||||
ReferenceOrderedData.RODIterator iter_commas = ROD_commas.iterator();
|
||||
|
||||
TabularROD one = (TabularROD)iter_commas.next();
|
||||
assertTrue(one.size() == 3);
|
||||
assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 1)));
|
||||
assertTrue(one.get("col1").equals("1"));
|
||||
assertTrue(one.get("col2").equals("2"));
|
||||
assertTrue(one.get("col3").equals("3"));
|
||||
|
||||
TabularROD two = (TabularROD)iter_commas.next();
|
||||
assertTrue(two.size() == 3);
|
||||
assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 2)));
|
||||
assertTrue(two.get("col1").equals("3"));
|
||||
assertTrue(two.get("col2").equals("4"));
|
||||
assertTrue(two.get("col3").equals("5"));
|
||||
}
|
||||
|
||||
@Test (expected=RuntimeException.class )
|
||||
public void testBadHeader1() {
|
||||
logger.warn("Executing testBadHeader1");
|
||||
ArrayList<String> header = new ArrayList<String>();
|
||||
TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1));
|
||||
}
|
||||
|
||||
@Test (expected=RuntimeException.class )
|
||||
public void testBadHeader2() {
|
||||
logger.warn("Executing testBadHeader2");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("col1", "col2", "col3"));
|
||||
TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1));
|
||||
}
|
||||
|
||||
@Test (expected=RuntimeException.class )
|
||||
public void testBadData1() {
|
||||
logger.warn("Executing testBadData1");
|
||||
ArrayList<String> header = new ArrayList<String>(Arrays.asList("HEADER", "col1", "col2", "col3"));
|
||||
assertTrue(TabularROD.headerString(header).equals("HEADER\tcol1\tcol2\tcol3"));
|
||||
String rowData = String.format("%d %d %d %d", 1, 2, 3, 4);
|
||||
TabularROD row = new TabularROD("myName", header, new GenomeLoc("chrM", 1), rowData.split(" "));
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# comment
|
||||
# comment line
|
||||
#
|
||||
HEADER COL1 COL2 COL3 COL4
|
||||
chrM:10 A B C 1
|
||||
chrM:20 C D E 2
|
||||
chrM:30 F G H 3
|
||||
HEADER,COL1,COL2,COL3,COL4
|
||||
chrM:10,A,B,C,1
|
||||
chrM:20,C,D,E,2
|
||||
chrM:30,F,G,H,3
|
||||
Loading…
Reference in New Issue