2009-03-16 06:37:20 +08:00
|
|
|
package org.broadinstitute.sting.gatk.refdata;
|
|
|
|
|
|
|
|
|
|
import java.util.HashMap;
|
2009-04-02 06:54:38 +08:00
|
|
|
import java.util.Scanner;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.regex.MatchResult;
|
|
|
|
|
import java.util.regex.Pattern;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
|
|
|
|
import org.broadinstitute.sting.utils.GenomeLoc;
|
2009-04-02 06:54:38 +08:00
|
|
|
import org.broadinstitute.sting.utils.Utils;
|
2009-03-16 06:37:20 +08:00
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Class for representing arbitrary reference ordered data sets
|
|
|
|
|
*
|
|
|
|
|
* User: mdepristo
|
|
|
|
|
* Date: Feb 27, 2009
|
|
|
|
|
* Time: 10:47:14 AM
|
|
|
|
|
* To change this template use File | Settings | File Templates.
|
|
|
|
|
*/
|
2009-05-15 05:06:28 +08:00
|
|
|
public class rodGFF extends BasicReferenceOrderedDatum {
|
2009-03-16 06:37:20 +08:00
|
|
|
private String contig, source, feature, strand, frame;
|
|
|
|
|
private long start, stop;
|
|
|
|
|
private double score;
|
|
|
|
|
private HashMap<String, String> attributes;
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Constructors
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
2009-04-04 00:41:33 +08:00
|
|
|
public rodGFF(final String name) {
|
|
|
|
|
super(name);
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public void setValues(final String contig, final String source, final String feature,
|
|
|
|
|
final long start, final long stop, final double score,
|
|
|
|
|
final String strand, final String frame, HashMap<String, String> attributes) {
|
|
|
|
|
this.contig = contig;
|
|
|
|
|
this.source = source;
|
|
|
|
|
this.feature = feature;
|
|
|
|
|
this.start = start;
|
|
|
|
|
this.stop= stop;
|
|
|
|
|
this.score = score;
|
|
|
|
|
this.strand = strand;
|
|
|
|
|
this.frame = frame;
|
|
|
|
|
this.attributes = attributes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// Accessors
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public String getSource() {
|
|
|
|
|
return source;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getFeature() {
|
|
|
|
|
return feature;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getStrand() {
|
|
|
|
|
return strand;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getFrame() {
|
|
|
|
|
return frame;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public double getScore() {
|
|
|
|
|
return score;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public GenomeLoc getLocation() {
|
|
|
|
|
return new GenomeLoc(contig, start, stop);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getAttribute(final String key) {
|
|
|
|
|
return attributes.get(key);
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-02 06:54:38 +08:00
|
|
|
public boolean containsAttribute(final String key) {
|
|
|
|
|
return attributes.containsKey(key);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public HashMap<String,String> getAttributes() {
|
|
|
|
|
return attributes;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String getAttributeString() {
|
|
|
|
|
String[] strings = new String[attributes.size()];
|
|
|
|
|
int i = 0;
|
|
|
|
|
for ( Map.Entry<String, String> pair : attributes.entrySet() ) {
|
|
|
|
|
strings[i++] = pair.getKey() + " " + pair.getValue();
|
|
|
|
|
//strings[i++] = "(" + pair.getKey() + ") (" + pair.getValue() + ")";
|
|
|
|
|
}
|
|
|
|
|
return Utils.join(" ; ", strings);
|
|
|
|
|
}
|
|
|
|
|
|
2009-03-16 06:37:20 +08:00
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
//
|
|
|
|
|
// formatting
|
|
|
|
|
//
|
|
|
|
|
// ----------------------------------------------------------------------
|
|
|
|
|
public String toString() {
|
2009-04-02 06:54:38 +08:00
|
|
|
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s\t%s", contig, source, feature, start, stop, score, strand, frame, getAttributeString());
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String repl() {
|
|
|
|
|
return this.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public String toSimpleString() {
|
|
|
|
|
return String.format("%s", feature);
|
|
|
|
|
}
|
|
|
|
|
|
2009-04-02 06:54:38 +08:00
|
|
|
|
|
|
|
|
private static Pattern GFF_DELIM = Pattern.compile("\\s+;\\s*");
|
|
|
|
|
private static Pattern GFF_ATTRIBUTE_PATTERN = Pattern.compile("([A-Za-z][A-Za-z0-9_]*)((?:\\s+\\S+)+)");
|
|
|
|
|
final private HashMap<String, String> parseAttributes( final String attributeLine ) {
|
|
|
|
|
HashMap<String, String> attributes = new HashMap<String, String>();
|
|
|
|
|
Scanner scanner = new Scanner(attributeLine);
|
|
|
|
|
scanner.useDelimiter(GFF_DELIM);
|
|
|
|
|
while ( scanner.hasNext(GFF_ATTRIBUTE_PATTERN) ) {
|
|
|
|
|
MatchResult result = scanner.match();
|
|
|
|
|
String key = result.group(1);
|
|
|
|
|
String value = result.group(2).replace("\"", "").trim();
|
|
|
|
|
//System.out.printf(" Adding %s / %s (total %d)%n", key, value, result.groupCount());
|
|
|
|
|
attributes.put(key, value);
|
|
|
|
|
String n = scanner.next();
|
|
|
|
|
//System.out.printf(" next is %s%n", n);
|
|
|
|
|
}
|
|
|
|
|
return attributes;
|
|
|
|
|
}
|
|
|
|
|
|
2009-05-15 05:06:28 +08:00
|
|
|
public boolean parseLine(final Object header, final String[] parts) {
|
2009-03-16 06:37:20 +08:00
|
|
|
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
|
|
|
|
|
|
|
|
|
final String contig = parts[0];
|
|
|
|
|
final String source = parts[1];
|
|
|
|
|
final String feature = parts[2];
|
|
|
|
|
final long start = Long.parseLong(parts[3]);
|
|
|
|
|
final long stop = Long.parseLong(parts[4]);
|
|
|
|
|
|
|
|
|
|
double score = Double.NaN;
|
|
|
|
|
if ( ! parts[5].equals(".") )
|
|
|
|
|
score = Double.parseDouble(parts[5]);
|
|
|
|
|
|
|
|
|
|
final String strand = parts[6];
|
|
|
|
|
final String frame = parts[7];
|
2009-04-02 06:54:38 +08:00
|
|
|
final String attributeParts = Utils.join(" ", parts, 8, parts.length);
|
|
|
|
|
HashMap<String, String> attributes = parseAttributes(attributeParts);
|
2009-03-16 06:37:20 +08:00
|
|
|
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
2009-05-15 05:06:28 +08:00
|
|
|
return true;
|
2009-03-16 06:37:20 +08:00
|
|
|
}
|
|
|
|
|
}
|