GFF now parses attributes correctly and efficiently. Slightly better interface to Utils.join

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@253 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-04-01 22:54:38 +00:00
parent ce57fed2fb
commit d952790258
4 changed files with 94 additions and 14 deletions

View File

@ -158,7 +158,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
if (TEST_ROD) {
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class);
gff.testMe();
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );

View File

@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.FileNotFoundException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;
@ -10,6 +11,8 @@ import java.util.Collections;
import edu.mit.broad.picard.util.TabbedTextFileParser;
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.xReadLines;
import org.broadinstitute.sting.utils.Utils;
/**
* Class for representing arbitrary reference ordered data sets
@ -38,12 +41,14 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
//
// ----------------------------------------------------------------------
public void testMe() {
ReferenceOrderedDatum last = null;
for ( ReferenceOrderedDatum rec : this ) {
if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) {
System.out.println(rec.toString());
System.out.println(rec.toString());
rodGFF gff = (rodGFF)rec;
String[] keys = {"LENGTH", "ALT", "FOBARBAR"};
for ( String key : keys) {
System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none");
}
last = rec;
}
System.exit(1);
}
@ -98,19 +103,25 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
//
// ----------------------------------------------------------------------
private class SimpleRODIterator implements Iterator<ROD> {
//private WhitespaceTextFileParser parser = null;
private TabbedTextFileParser parser = null;
private xReadLines parser = null;
public SimpleRODIterator() {
parser = new TabbedTextFileParser(true, file);
try {
parser = new xReadLines(file);
} catch ( FileNotFoundException e ) {
Utils.scareUser("Couldn't open file: " + file);
}
}
public boolean hasNext() {
//System.out.printf("Parser has next: %b%n", parser.hasNext());
return parser.hasNext();
}
public ROD next() {
String parts[] = parser.next();
final String line = parser.next();
//System.out.printf("Line is %s%n", line);
String parts[] = line.split("\t");
return parseLine(parts);
}
@ -119,6 +130,28 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
}
}
// private class SimpleRODIterator implements Iterator<ROD> {
// //private WhitespaceTextFileParser parser = null;
// private TabbedTextFileParser parser = null;
//
// public SimpleRODIterator() {
// parser = new TabbedTextFileParser(true, file);
// }
//
// public boolean hasNext() {
// return parser.hasNext();
// }
//
// public ROD next() {
// String parts[] = parser.next();
// return parseLine(parts);
// }
//
// public void remove() {
// throw new UnsupportedOperationException();
// }
// }
public class RODIterator implements Iterator<ROD> {
private PushbackIterator<ROD> it;
private ROD prev = null;

View File

@ -1,9 +1,14 @@
package org.broadinstitute.sting.gatk.refdata;
import java.util.HashMap;
import java.util.Scanner;
import java.util.Map;
import java.util.regex.MatchResult;
import java.util.regex.Pattern;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.Utils;
/**
* Class for representing arbitrary reference ordered data sets
@ -75,13 +80,31 @@ public class rodGFF extends ReferenceOrderedDatum {
return attributes.get(key);
}
/**
 * Tests whether this record carries an attribute with the given name.
 *
 * @param key attribute name to look up
 * @return true if the attribute map holds an entry for {@code key}
 */
public boolean containsAttribute(final String key) {
    final boolean present = attributes.containsKey(key);
    return present;
}
/**
 * Gives access to every attribute of this record.
 *
 * Note that the internal map itself is returned, not a copy, so any
 * change made through the returned reference is visible to this record.
 *
 * @return the attribute map (attribute name to value)
 */
public HashMap<String,String> getAttributes() {
    final HashMap<String,String> all = attributes;
    return all;
}
/**
 * Renders all attributes as a single string of "key value" pairs
 * separated by " ; ".
 *
 * Pairs appear in the iteration order of the attribute map.
 *
 * @return the joined attribute text
 */
public String getAttributeString() {
    final String[] pairs = new String[attributes.size()];
    int next = 0;
    for ( final Map.Entry<String, String> entry : attributes.entrySet() ) {
        final String name = entry.getKey();
        final String value = entry.getValue();
        pairs[next] = name + " " + value;
        next++;
    }
    return Utils.join(" ; ", pairs);
}
// ----------------------------------------------------------------------
//
// formatting
//
// ----------------------------------------------------------------------
public String toString() {
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s", contig, source, feature, start, stop, score, strand, frame, getAttributeString());
}
public String repl() {
@ -92,6 +115,25 @@ public class rodGFF extends ReferenceOrderedDatum {
return String.format("%s", feature);
}
/**
 * Separator between attributes: a semicolon with optional whitespace on
 * either side. Whitespace before the semicolon is optional so that both
 * {@code key value ; key value} and {@code key "value"; key "value";}
 * are split into individual attributes.
 */
private static final Pattern GFF_DELIM = Pattern.compile("\\s*;\\s*");

/**
 * A single attribute: an identifier (group 1) followed by one or more
 * whitespace-separated words that together form the value (group 2).
 */
private static final Pattern GFF_ATTRIBUTE_PATTERN = Pattern.compile("([A-Za-z][A-Za-z0-9_]*)((?:\\s+\\S+)+)");

/**
 * Parses the attribute text of a GFF record into a key/value map.
 *
 * The text is split on semicolons. Every piece of the form
 * {@code key value...} contributes one entry; double quotes are removed
 * from the value and surrounding whitespace is trimmed. Pieces that do
 * not have that form (for example a key without a value) are skipped
 * and parsing continues with the next piece. If a key occurs more than
 * once, the last value wins.
 *
 * @param attributeLine the raw attribute text; may be empty, must not be null
 * @return a new map from attribute name to value, empty if nothing parsed
 */
final private HashMap<String, String> parseAttributes( final String attributeLine ) {
    final HashMap<String, String> attributes = new HashMap<String, String>();

    // Trim first: leading whitespace would otherwise become part of the
    // first token and prevent it from matching the attribute pattern.
    final Scanner scanner = new Scanner(attributeLine.trim());
    try {
        scanner.useDelimiter(GFF_DELIM);
        while ( scanner.hasNext() ) {
            if ( scanner.hasNext(GFF_ATTRIBUTE_PATTERN) ) {
                // Consume the token with the same pattern so that match()
                // reports the capture groups of exactly this token.
                scanner.next(GFF_ATTRIBUTE_PATTERN);
                final MatchResult result = scanner.match();
                final String key = result.group(1);
                final String value = result.group(2).replace("\"", "").trim();
                attributes.put(key, value);
            } else {
                // Malformed piece: drop it, but keep parsing what follows.
                scanner.next();
            }
        }
    } finally {
        scanner.close();
    }

    return attributes;
}
public void parseLine(final String[] parts) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
@ -107,7 +149,8 @@ public class rodGFF extends ReferenceOrderedDatum {
final String strand = parts[6];
final String frame = parts[7];
HashMap<String, String> attributes = null;
final String attributeParts = Utils.join(" ", parts, 8, parts.length);
HashMap<String, String> attributes = parseAttributes(attributeParts);
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
}
}

View File

@ -212,11 +212,15 @@ public class Utils {
}
public static String join(String separator, String[] strings) {
if (strings.length == 0) {
return join(separator, strings, 0, strings.length);
}
public static String join(String separator, String[] strings, int start, int end) {
if ((end - start) == 0) {
return "";
}
StringBuilder ret = new StringBuilder(strings[0]);
for (int i = 1; i < strings.length; ++i) {
StringBuilder ret = new StringBuilder(strings[start]);
for (int i = start+1; i < end; ++i) {
ret.append(separator);
ret.append(strings[i]);
}