GFF now parses attributes correctly and efficiently. Slightly better interface to Utils.join
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@253 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
ce57fed2fb
commit
d952790258
|
|
@ -158,7 +158,7 @@ public class GenomeAnalysisTK extends CommandLineProgram {
|
||||||
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
|
List<ReferenceOrderedData> rods = new ArrayList<ReferenceOrderedData>();
|
||||||
|
|
||||||
if (TEST_ROD) {
|
if (TEST_ROD) {
|
||||||
ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class);
|
ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class);
|
||||||
gff.testMe();
|
gff.testMe();
|
||||||
|
|
||||||
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
//ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class );
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata;
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileWriter;
|
import java.io.FileWriter;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.io.FileNotFoundException;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
|
@ -10,6 +11,8 @@ import java.util.Collections;
|
||||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||||
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
import org.broadinstitute.sting.gatk.iterators.PushbackIterator;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.xReadLines;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class for representing arbitrary reference ordered data sets
|
* Class for representing arbitrary reference ordered data sets
|
||||||
|
|
@ -38,12 +41,14 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
//
|
//
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
public void testMe() {
|
public void testMe() {
|
||||||
ReferenceOrderedDatum last = null;
|
|
||||||
for ( ReferenceOrderedDatum rec : this ) {
|
for ( ReferenceOrderedDatum rec : this ) {
|
||||||
if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) {
|
|
||||||
System.out.println(rec.toString());
|
System.out.println(rec.toString());
|
||||||
|
|
||||||
|
rodGFF gff = (rodGFF)rec;
|
||||||
|
String[] keys = {"LENGTH", "ALT", "FOBARBAR"};
|
||||||
|
for ( String key : keys) {
|
||||||
|
System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none");
|
||||||
}
|
}
|
||||||
last = rec;
|
|
||||||
}
|
}
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
|
|
@ -98,19 +103,25 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
//
|
//
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
private class SimpleRODIterator implements Iterator<ROD> {
|
private class SimpleRODIterator implements Iterator<ROD> {
|
||||||
//private WhitespaceTextFileParser parser = null;
|
private xReadLines parser = null;
|
||||||
private TabbedTextFileParser parser = null;
|
|
||||||
|
|
||||||
public SimpleRODIterator() {
|
public SimpleRODIterator() {
|
||||||
parser = new TabbedTextFileParser(true, file);
|
try {
|
||||||
|
parser = new xReadLines(file);
|
||||||
|
} catch ( FileNotFoundException e ) {
|
||||||
|
Utils.scareUser("Couldn't open file: " + file);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasNext() {
|
public boolean hasNext() {
|
||||||
|
//System.out.printf("Parser has next: %b%n", parser.hasNext());
|
||||||
return parser.hasNext();
|
return parser.hasNext();
|
||||||
}
|
}
|
||||||
|
|
||||||
public ROD next() {
|
public ROD next() {
|
||||||
String parts[] = parser.next();
|
final String line = parser.next();
|
||||||
|
//System.out.printf("Line is %s%n", line);
|
||||||
|
String parts[] = line.split("\t");
|
||||||
return parseLine(parts);
|
return parseLine(parts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -119,6 +130,28 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// private class SimpleRODIterator implements Iterator<ROD> {
|
||||||
|
// //private WhitespaceTextFileParser parser = null;
|
||||||
|
// private TabbedTextFileParser parser = null;
|
||||||
|
//
|
||||||
|
// public SimpleRODIterator() {
|
||||||
|
// parser = new TabbedTextFileParser(true, file);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public boolean hasNext() {
|
||||||
|
// return parser.hasNext();
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public ROD next() {
|
||||||
|
// String parts[] = parser.next();
|
||||||
|
// return parseLine(parts);
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// public void remove() {
|
||||||
|
// throw new UnsupportedOperationException();
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
|
||||||
public class RODIterator implements Iterator<ROD> {
|
public class RODIterator implements Iterator<ROD> {
|
||||||
private PushbackIterator<ROD> it;
|
private PushbackIterator<ROD> it;
|
||||||
private ROD prev = null;
|
private ROD prev = null;
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,14 @@
|
||||||
package org.broadinstitute.sting.gatk.refdata;
|
package org.broadinstitute.sting.gatk.refdata;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.Scanner;
|
||||||
|
import java.util.Map;
|
||||||
|
import java.util.regex.MatchResult;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||||
|
import org.broadinstitute.sting.utils.Utils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Class for representing arbitrary reference ordered data sets
|
* Class for representing arbitrary reference ordered data sets
|
||||||
|
|
@ -75,13 +80,31 @@ public class rodGFF extends ReferenceOrderedDatum {
|
||||||
return attributes.get(key);
|
return attributes.get(key);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public boolean containsAttribute(final String key) {
|
||||||
|
return attributes.containsKey(key);
|
||||||
|
}
|
||||||
|
|
||||||
|
public HashMap<String,String> getAttributes() {
|
||||||
|
return attributes;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getAttributeString() {
|
||||||
|
String[] strings = new String[attributes.size()];
|
||||||
|
int i = 0;
|
||||||
|
for ( Map.Entry<String, String> pair : attributes.entrySet() ) {
|
||||||
|
strings[i++] = pair.getKey() + " " + pair.getValue();
|
||||||
|
//strings[i++] = "(" + pair.getKey() + ") (" + pair.getValue() + ")";
|
||||||
|
}
|
||||||
|
return Utils.join(" ; ", strings);
|
||||||
|
}
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// formatting
|
// formatting
|
||||||
//
|
//
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s", contig, source, feature, start, stop, score, strand, frame, getAttributeString());
|
||||||
}
|
}
|
||||||
|
|
||||||
public String repl() {
|
public String repl() {
|
||||||
|
|
@ -92,6 +115,25 @@ public class rodGFF extends ReferenceOrderedDatum {
|
||||||
return String.format("%s", feature);
|
return String.format("%s", feature);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
private static Pattern GFF_DELIM = Pattern.compile("\\s+;\\s*");
|
||||||
|
private static Pattern GFF_ATTRIBUTE_PATTERN = Pattern.compile("([A-Za-z][A-Za-z0-9_]*)((?:\\s+\\S+)+)");
|
||||||
|
final private HashMap<String, String> parseAttributes( final String attributeLine ) {
|
||||||
|
HashMap<String, String> attributes = new HashMap<String, String>();
|
||||||
|
Scanner scanner = new Scanner(attributeLine);
|
||||||
|
scanner.useDelimiter(GFF_DELIM);
|
||||||
|
while ( scanner.hasNext(GFF_ATTRIBUTE_PATTERN) ) {
|
||||||
|
MatchResult result = scanner.match();
|
||||||
|
String key = result.group(1);
|
||||||
|
String value = result.group(2).replace("\"", "").trim();
|
||||||
|
//System.out.printf(" Adding %s / %s (total %d)%n", key, value, result.groupCount());
|
||||||
|
attributes.put(key, value);
|
||||||
|
String n = scanner.next();
|
||||||
|
//System.out.printf(" next is %s%n", n);
|
||||||
|
}
|
||||||
|
return attributes;
|
||||||
|
}
|
||||||
|
|
||||||
public void parseLine(final String[] parts) {
|
public void parseLine(final String[] parts) {
|
||||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||||
|
|
||||||
|
|
@ -107,7 +149,8 @@ public class rodGFF extends ReferenceOrderedDatum {
|
||||||
|
|
||||||
final String strand = parts[6];
|
final String strand = parts[6];
|
||||||
final String frame = parts[7];
|
final String frame = parts[7];
|
||||||
HashMap<String, String> attributes = null;
|
final String attributeParts = Utils.join(" ", parts, 8, parts.length);
|
||||||
|
HashMap<String, String> attributes = parseAttributes(attributeParts);
|
||||||
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -212,11 +212,15 @@ public class Utils {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String join(String separator, String[] strings) {
|
public static String join(String separator, String[] strings) {
|
||||||
if (strings.length == 0) {
|
return join(separator, strings, 0, strings.length);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String join(String separator, String[] strings, int start, int end) {
|
||||||
|
if ((end - start) == 0) {
|
||||||
return "";
|
return "";
|
||||||
}
|
}
|
||||||
StringBuilder ret = new StringBuilder(strings[0]);
|
StringBuilder ret = new StringBuilder(strings[start]);
|
||||||
for (int i = 1; i < strings.length; ++i) {
|
for (int i = start+1; i < end; ++i) {
|
||||||
ret.append(separator);
|
ret.append(separator);
|
||||||
ret.append(strings[i]);
|
ret.append(strings[i]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue