GFF now parses attributes correctly and efficiently. Slightly better interface to Utils.join (new overload that joins only the elements in a start/end index range).
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@253 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ce57fed2fb
commit
d952790258
|
|
@ -158,7 +158,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
|||
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
|
||||
|
||||
if (TEST_ROD) {
|
||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
|
||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class);
|
||||
gff.testMe();
|
||||
|
||||
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata;
|
|||
import java.io.File;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
|
@ -10,6 +11,8 @@ import java.util.Collections;
|
|||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
|
|
@ -38,12 +41,14 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public void testMe() {
|
||||
ReferenceOrderedDatum last = null;
|
||||
for ( ReferenceOrderedDatum rec : this ) {
|
||||
if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) {
|
||||
System.out.println(rec.toString());
|
||||
System.out.println(rec.toString());
|
||||
|
||||
rodGFF gff = (rodGFF)rec;
|
||||
String[] keys = {"LENGTH", "ALT", "FOBARBAR"};
|
||||
for ( String key : keys) {
|
||||
System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none");
|
||||
}
|
||||
last = rec;
|
||||
}
|
||||
System.exit(1);
|
||||
}
|
||||
|
|
@ -98,19 +103,25 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
//
|
||||
// ----------------------------------------------------------------------
|
||||
private class SimpleRODIterator implements Iterator<ROD> {
|
||||
//private WhitespaceTextFileParser parser = null;
|
||||
private TabbedTextFileParser parser = null;
|
||||
private xReadLines parser = null;
|
||||
|
||||
public SimpleRODIterator() {
|
||||
parser = new TabbedTextFileParser(true, file);
|
||||
try {
|
||||
parser = new xReadLines(file);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
Utils.scareUser("Couldn't open file: " + file);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean hasNext() {
|
||||
//System.out.printf("Parser has next: %b%n", parser.hasNext());
|
||||
return parser.hasNext();
|
||||
}
|
||||
|
||||
public ROD next() {
|
||||
String parts[] = parser.next();
|
||||
final String line = parser.next();
|
||||
//System.out.printf("Line is %s%n", line);
|
||||
String parts[] = line.split("\t");
|
||||
return parseLine(parts);
|
||||
}
|
||||
|
||||
|
|
@ -119,6 +130,28 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
}
|
||||
}
|
||||
|
||||
// private class SimpleRODIterator implements Iterator<ROD> {
|
||||
// //private WhitespaceTextFileParser parser = null;
|
||||
// private TabbedTextFileParser parser = null;
|
||||
//
|
||||
// public SimpleRODIterator() {
|
||||
// parser = new TabbedTextFileParser(true, file);
|
||||
// }
|
||||
//
|
||||
// public boolean hasNext() {
|
||||
// return parser.hasNext();
|
||||
// }
|
||||
//
|
||||
// public ROD next() {
|
||||
// String parts[] = parser.next();
|
||||
// return parseLine(parts);
|
||||
// }
|
||||
//
|
||||
// public void remove() {
|
||||
// throw new UnsupportedOperationException();
|
||||
// }
|
||||
// }
|
||||
|
||||
public class RODIterator implements Iterator<ROD> {
|
||||
private PushbackIterator<ROD> it;
|
||||
private ROD prev = null;
|
||||
|
|
|
|||
|
|
@ -1,9 +1,14 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Scanner;
|
||||
import java.util.Map;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
|
|
@ -75,13 +80,31 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
return attributes.get(key);
|
||||
}
|
||||
|
||||
/**
 * Reports whether this record's attribute map has an entry for the given key.
 *
 * @param key attribute name to look up
 * @return true if the attribute map contains the key, false otherwise
 */
public boolean containsAttribute(final String key) {
|
||||
return attributes.containsKey(key);
|
||||
}
|
||||
|
||||
/**
 * Returns this record's attribute map.
 *
 * The map reference itself is returned, not a copy, so changes made by the
 * caller are visible through this record.
 *
 * @return the key/value attribute map held by this record
 */
public HashMap<String,String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
/**
 * Renders the attribute map as a single string.
 *
 * Each entry becomes "key value" (single space between them) and the entries
 * are joined with " ; ". Entries appear in the iteration order of the map's
 * entry set. NOTE(review): the map is exposed as a HashMap elsewhere in this
 * class, so that order is presumably not the order in the input file - confirm.
 *
 * @return the joined attribute string
 */
public String getAttributeString() {
|
||||
String[] strings = new String[attributes.size()];
|
||||
int i = 0;
|
||||
for ( Map.Entry<String, String> pair : attributes.entrySet() ) {
|
||||
strings[i++] = pair.getKey() + " " + pair.getValue();
|
||||
//strings[i++] = "(" + pair.getKey() + ") (" + pair.getValue() + ")";
|
||||
}
|
||||
return Utils.join(" ; ", strings);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s", contig, source, feature, start, stop, score, strand, frame, getAttributeString());
|
||||
}
|
||||
|
||||
public String repl() {
|
||||
|
|
@ -92,6 +115,25 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
return String.format("%s", feature);
|
||||
}
|
||||
|
||||
|
||||
// Separator between attributes: one or more whitespace characters, a semicolon,
// then optional whitespace. A semicolon with no whitespace before it does not match.
private static Pattern GFF_DELIM = Pattern.compile("\\s+;\\s*");
|
||||
// A single attribute token. Group 1 is the key: a letter followed by letters,
// digits or underscores. Group 2 is the rest: one or more runs of non-whitespace,
// each preceded by whitespace (so the captured text starts with whitespace).
private static Pattern GFF_ATTRIBUTE_PATTERN = Pattern.compile("([A-Za-z][A-Za-z0-9_]*)((?:\\s+\\S+)+)");
|
||||
/**
 * Parses an attribute string into a key/value map.
 *
 * The input is tokenized with a Scanner that uses GFF_DELIM as its delimiter.
 * Each token that matches GFF_ATTRIBUTE_PATTERN contributes one entry: group 1
 * is the key, and group 2, with all double-quote characters removed and then
 * trimmed, is the value. A key that occurs more than once keeps the value
 * stored last.
 *
 * Parsing stops at the first token that does not match the attribute pattern
 * (or at end of input); any tokens after that point are not added to the map.
 *
 * @param attributeLine the attribute text to parse
 * @return a new map holding the attributes parsed before the loop stopped
 */
final private HashMap<String, String> parseAttributes( final String attributeLine ) {
|
||||
HashMap<String, String> attributes = new HashMap<String, String>();
|
||||
Scanner scanner = new Scanner(attributeLine);
|
||||
scanner.useDelimiter(GFF_DELIM);
|
||||
// hasNext(pattern) only tests the upcoming token; it does not consume it.
while ( scanner.hasNext(GFF_ATTRIBUTE_PATTERN) ) {
|
||||
// match() exposes the capture groups from the hasNext(pattern) test above.
MatchResult result = scanner.match();
|
||||
String key = result.group(1);
|
||||
String value = result.group(2).replace("\"", "").trim();
|
||||
//System.out.printf(" Adding %s / %s (total %d)%n", key, value, result.groupCount());
|
||||
attributes.put(key, value);
|
||||
// Consume the token just examined so the next iteration sees the following one.
// The returned text is only referenced by the commented-out debug print below.
String n = scanner.next();
|
||||
//System.out.printf(" next is %s%n", n);
|
||||
}
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
|
||||
|
|
@ -107,7 +149,8 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
|
||||
final String strand = parts[6];
|
||||
final String frame = parts[7];
|
||||
HashMap<String, String> attributes = null;
|
||||
final String attributeParts = Utils.join(" ", parts, 8, parts.length);
|
||||
HashMap<String, String> attributes = parseAttributes(attributeParts);
|
||||
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -212,11 +212,15 @@ public class Utils {
|
|||
}
|
||||
|
||||
public static String join(String separator, String[] strings) {
|
||||
if (strings.length == 0) {
|
||||
return join(separator, strings, 0, strings.length);
|
||||
}
|
||||
|
||||
public static String join(String separator, String[] strings, int start, int end) {
|
||||
if ((end - start) == 0) {
|
||||
return "";
|
||||
}
|
||||
StringBuilder ret = new StringBuilder(strings[0]);
|
||||
for (int i = 1; i < strings.length; ++i) {
|
||||
StringBuilder ret = new StringBuilder(strings[start]);
|
||||
for (int i = start+1; i < end; ++i) {
|
||||
ret.append(separator);
|
||||
ret.append(strings[i]);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue