From d9527902589da4742e45c0e79eefc542932e2278 Mon Sep 17 00:00:00 2001 From: depristo Date: Wed, 1 Apr 2009 22:54:38 +0000 Subject: [PATCH] GFF now parses attributes correctly and efficiently. Slightly better interface to Utils.join git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@253 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/GenomeAnalysisTK.java | 2 +- .../gatk/refdata/ReferenceOrderedData.java | 49 ++++++++++++++++--- .../sting/gatk/refdata/rodGFF.java | 47 +++++++++++++++++- .../org/broadinstitute/sting/utils/Utils.java | 10 ++-- 4 files changed, 94 insertions(+), 14 deletions(-) diff --git a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java index 95102c7d5..168fe0d09 100644 --- a/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java +++ b/java/src/org/broadinstitute/sting/gatk/GenomeAnalysisTK.java @@ -158,7 +158,7 @@ public class GenomeAnalysisTK extends CommandLineProgram { List rods = new ArrayList(); if (TEST_ROD) { - ReferenceOrderedData gff = new ReferenceOrderedData(new File("trunk/data/gFFTest.gff"), rodGFF.class); + ReferenceOrderedData gff = new ReferenceOrderedData(new File("single.gff"), rodGFF.class); gff.testMe(); //ReferenceOrderedData dbsnp = new ReferenceOrderedData(new File("trunk/data/dbSNP_head.txt"), rodDbSNP.class ); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index 2263ecfed..33343b735 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -3,6 +3,7 @@ package org.broadinstitute.sting.gatk.refdata; import java.io.File; import java.io.FileWriter; import java.io.IOException; +import java.io.FileNotFoundException; import java.util.Iterator; import java.util.ArrayList; import java.util.Collections; @@ -10,6 +11,8 @@ import java.util.Collections; import edu.mit.broad.picard.util.TabbedTextFileParser; import org.broadinstitute.sting.gatk.iterators.PushbackIterator; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.xReadLines; +import org.broadinstitute.sting.utils.Utils; /** * Class for representing arbitrary reference ordered data sets @@ -38,12 +41,14 @@ public class ReferenceOrderedData implements // // ---------------------------------------------------------------------- public void testMe() { - ReferenceOrderedDatum last = null; for ( ReferenceOrderedDatum rec : this ) { - if ( last == null || ! last.getLocation().onSameContig(rec.getLocation()) ) { - System.out.println(rec.toString()); + System.out.println(rec.toString()); + + rodGFF gff = (rodGFF)rec; + String[] keys = {"LENGTH", "ALT", "FOBARBAR"}; + for ( String key : keys) { + System.out.printf(" -> %s is (%s)%n", key, gff.containsAttribute(key) ? gff.getAttribute(key) : "none"); } - last = rec; } System.exit(1); } @@ -98,19 +103,25 @@ public class ReferenceOrderedData implements // // ---------------------------------------------------------------------- private class SimpleRODIterator implements Iterator { - //private WhitespaceTextFileParser parser = null; - private TabbedTextFileParser parser = null; + private xReadLines parser = null; public SimpleRODIterator() { - parser = new TabbedTextFileParser(true, file); + try { + parser = new xReadLines(file); + } catch ( FileNotFoundException e ) { + Utils.scareUser("Couldn't open file: " + file); + } } public boolean hasNext() { + //System.out.printf("Parser has next: %b%n", parser.hasNext()); return parser.hasNext(); } public ROD next() { - String parts[] = parser.next(); + final String line = parser.next(); + //System.out.printf("Line is %s%n", line); + String parts[] = line.split("\t"); return parseLine(parts); } @@ -119,6 +130,28 @@ public class ReferenceOrderedData implements } } +// private class SimpleRODIterator implements Iterator { +// //private WhitespaceTextFileParser parser = null; +// private TabbedTextFileParser parser = null; +// +// public SimpleRODIterator() { +// parser = new TabbedTextFileParser(true, file); +// } +// +// public boolean hasNext() { +// return parser.hasNext(); +// } +// +// public ROD next() { +// String parts[] = parser.next(); +// return parseLine(parts); +// } +// +// public void remove() { +// throw new UnsupportedOperationException(); +// } +// } + public class RODIterator implements Iterator { private PushbackIterator it; private ROD prev = null; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java index a82911b57..6186d6154 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java @@ -1,9 +1,14 @@ package org.broadinstitute.sting.gatk.refdata; import java.util.HashMap; +import java.util.Scanner; +import java.util.Map; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; /** * Class for representing arbitrary reference ordered data sets @@ -75,13 +80,31 @@ public class rodGFF extends ReferenceOrderedDatum { return attributes.get(key); } + public boolean containsAttribute(final String key) { + return attributes.containsKey(key); + } + + public HashMap getAttributes() { + return attributes; + } + + public String getAttributeString() { + String[] strings = new String[attributes.size()]; + int i = 0; + for ( Map.Entry pair : attributes.entrySet() ) { + strings[i++] = pair.getKey() + " " + pair.getValue(); + //strings[i++] = "(" + pair.getKey() + ") (" + pair.getValue() + ")"; + } + return Utils.join(" ; ", strings); + } + // ---------------------------------------------------------------------- // // formatting // // ---------------------------------------------------------------------- public String toString() { - return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame); + return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s", contig, source, feature, start, stop, score, strand, frame, getAttributeString()); } public String repl() { @@ -92,6 +115,25 @@ public class rodGFF extends ReferenceOrderedDatum { return String.format("%s", feature); } + + private static Pattern GFF_DELIM = Pattern.compile("\\s+;\\s*"); + private static Pattern GFF_ATTRIBUTE_PATTERN = Pattern.compile("([A-Za-z][A-Za-z0-9_]*)((?:\\s+\\S+)+)"); + final private HashMap parseAttributes( final String attributeLine ) { + HashMap attributes = new HashMap(); + Scanner scanner = new Scanner(attributeLine); + scanner.useDelimiter(GFF_DELIM); + while ( scanner.hasNext(GFF_ATTRIBUTE_PATTERN) ) { + MatchResult result = scanner.match(); + String key = result.group(1); + String value = result.group(2).replace("\"", "").trim(); + //System.out.printf(" Adding %s / %s (total %d)%n", key, value, result.groupCount()); + attributes.put(key, value); + String n = scanner.next(); + //System.out.printf(" next is %s%n", n); + } + return attributes; + } + public void parseLine(final String[] parts) { //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); @@ -107,7 +149,8 @@ public class rodGFF extends ReferenceOrderedDatum { final String strand = parts[6]; final String frame = parts[7]; - HashMap attributes = null; + final String attributeParts = Utils.join(" ", parts, 8, parts.length); + HashMap attributes = parseAttributes(attributeParts); setValues(contig, source, feature, start, stop, score, strand, frame, attributes); } } diff --git a/java/src/org/broadinstitute/sting/utils/Utils.java b/java/src/org/broadinstitute/sting/utils/Utils.java index 09b0e86be..4a3033fc3 100755 --- a/java/src/org/broadinstitute/sting/utils/Utils.java +++ b/java/src/org/broadinstitute/sting/utils/Utils.java @@ -212,11 +212,15 @@ public class Utils { } public static String join(String separator, String[] strings) { - if (strings.length == 0) { + return join(separator, strings, 0, strings.length); + } + + public static String join(String separator, String[] strings, int start, int end) { + if ((end - start) == 0) { return ""; } - StringBuilder ret = new StringBuilder(strings[0]); - for (int i = 1; i < strings.length; ++i) { + StringBuilder ret = new StringBuilder(strings[start]); + for (int i = start+1; i < end; ++i) { ret.append(separator); ret.append(strings[i]); }