Revised ROD system. Split the system in Basic type and interface. Enabled more control over rod accessing, including an initialize() function to fetch headers and other options from the file. Added general tabular rod, which has a named columns and supports a map<String,String> interface. Comes with shiny new Junit system for RODs. Also, added simple python script for accessing picard data.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@716 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
67293168e7
commit
8e9e2f4502
|
|
@ -0,0 +1,45 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:49:47 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatum {
|
||||
protected String name;
|
||||
|
||||
public BasicReferenceOrderedDatum(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public String getName() { return this.name; }
|
||||
|
||||
public abstract boolean parseLine(final Object header, final String[] parts) throws IOException;
|
||||
|
||||
public abstract String toString();
|
||||
public String toSimpleString() { return toString(); }
|
||||
public String repl() { return this.toString(); }
|
||||
|
||||
public String delimiter() {
|
||||
return "\t";
|
||||
}
|
||||
|
||||
public abstract GenomeLoc getLocation();
|
||||
|
||||
public int compareTo( ReferenceOrderedDatum that ) {
|
||||
return getLocation().compareTo(that.getLocation());
|
||||
}
|
||||
|
||||
public Object initialize(final File source) throws FileNotFoundException {
|
||||
//System.out.printf("Initialize called with %s%n", source);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
|
@ -11,7 +11,7 @@ import edu.mit.broad.picard.util.SequenceUtil;
|
|||
/**
|
||||
* ReferenceOrderedDatum class to hold HapMap AlleleFrequency Data
|
||||
*/
|
||||
public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum {
|
||||
public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum {
|
||||
public GenomeLoc loc; // genome location of SNP
|
||||
// Reference sequence chromosome or scaffold
|
||||
// Start and stop positions in chrom
|
||||
|
|
@ -46,7 +46,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum {
|
|||
|
||||
return String.format(
|
||||
"%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%1.3f\t%1.3f\t%d",
|
||||
rsNumber, hgBuild, getContig(), getStart(), strand, alleles, refAllele, varAllele,
|
||||
rsNumber, hgBuild, getLocation().getContig(), getLocation().getStart(), strand, alleles, refAllele, varAllele,
|
||||
refCounts, varCounts, refFreq, varFreq, totalCounts);
|
||||
}
|
||||
|
||||
|
|
@ -58,7 +58,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum {
|
|||
return toString();
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
public boolean parseLine(final Object header, final String[] parts) {
|
||||
try {
|
||||
// rs11511647 <=> HG18 <=> chr10 <=> 62765 <=> + <=> T/C <=> T <=> C <=> 21 <=> 97 <=> 0.178 <=> 0.822 <=> 118
|
||||
|
||||
|
|
@ -85,6 +85,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum {
|
|||
System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts));
|
||||
throw e;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public double getVarAlleleFreq() { return this.varFreq; }
|
||||
|
|
|
|||
|
|
@ -25,6 +25,13 @@ import org.apache.log4j.Logger;
|
|||
public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements Iterable<ROD> {
|
||||
private String name;
|
||||
private File file = null;
|
||||
private String fieldDelimiter;
|
||||
|
||||
/**
|
||||
* Header object returned from the datum
|
||||
*/
|
||||
private Object header = null;
|
||||
|
||||
private Class<ROD> type = null; // runtime type information for object construction
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
|
|
@ -53,6 +60,7 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
addModule("dbSNP", rodDbSNP.class);
|
||||
addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class);
|
||||
addModule("SAMPileup", rodSAMPileup.class);
|
||||
addModule("Table", TabularROD.class);
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -124,6 +132,8 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
this.file = file;
|
||||
this.type = type;
|
||||
this.name = name;
|
||||
this.header = initializeROD(name, file, type);
|
||||
this.fieldDelimiter = newROD(name, type).delimiter();
|
||||
}
|
||||
|
||||
public String getName() { return name; }
|
||||
|
|
@ -218,8 +228,9 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
public ROD next() {
|
||||
final String line = parser.next();
|
||||
//System.out.printf("Line is %s%n", line);
|
||||
String parts[] = line.split("\t");
|
||||
return parseLine(parts);
|
||||
String parts[] = line.split(fieldDelimiter);
|
||||
ROD n = parseLine(parts);
|
||||
return n != null ? n : next();
|
||||
}
|
||||
|
||||
public void remove() {
|
||||
|
|
@ -263,52 +274,41 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
return prev;
|
||||
}
|
||||
|
||||
//
|
||||
// Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||
// If we don't find anything and cross beyond contig / pos, we return null
|
||||
// Otherwise we return the first object who's start is at pos
|
||||
//
|
||||
/**
|
||||
* Seeks forward in the file until we reach (or cross) a record at contig / pos
|
||||
* If we don't find anything and cross beyond contig / pos, we return null
|
||||
* Otherwise we return the first object who's start is at pos
|
||||
*
|
||||
* @param loc
|
||||
* @return
|
||||
*/
|
||||
public ROD seekForward(final GenomeLoc loc) {
|
||||
return seekForward(loc.getContig(), loc.getStart());
|
||||
}
|
||||
|
||||
protected ROD seekForward(final String contigName, final long pos) {
|
||||
final boolean DEBUG = false;
|
||||
final boolean DEBUG = false;
|
||||
|
||||
ROD result = null;
|
||||
|
||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev);
|
||||
if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", loc.getContig(), loc.getStart(), prev);
|
||||
while ( hasNext() ) {
|
||||
ROD current = next();
|
||||
//System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart());
|
||||
int strCmp = GenomeLoc.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() );
|
||||
if ( strCmp == 0 ) {
|
||||
// The contigs are equal
|
||||
if ( current.getStart() > pos ) {
|
||||
// There was nothing to find, push back next and return null
|
||||
it.pushback(current);
|
||||
break;
|
||||
}
|
||||
else if ( pos == current.getStart() ) {
|
||||
// We found a record at contig / pos, return it
|
||||
result = current;
|
||||
break;
|
||||
}
|
||||
int cmp = current.getLocation().compareTo(loc);
|
||||
if ( cmp < 0 ) {
|
||||
// current occurs before loc, continue searching
|
||||
continue;
|
||||
}
|
||||
else if ( strCmp < 0 ) {
|
||||
if ( DEBUG ) System.out.printf(" -> Jumped to contig %s%n", contigName);
|
||||
// We've gone past the desired contig, break
|
||||
else if ( cmp == 0 ) {
|
||||
result = current;
|
||||
break;
|
||||
} else {
|
||||
// current is after loc
|
||||
it.pushback(current);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if ( DEBUG ) {
|
||||
if ( result == null )
|
||||
;
|
||||
//System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos);
|
||||
else
|
||||
System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart());
|
||||
if ( result != null )
|
||||
System.out.printf(" ### Found %s%n", result.getLocation());
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -326,26 +326,39 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
// Parsing
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
ROD parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
private ROD newROD( final String name, final Class<ROD> type ) {
|
||||
try {
|
||||
//ROD obj = type.newInstance();
|
||||
Constructor<ROD> c = type.getConstructor(String.class);
|
||||
ROD obj = (ROD)c.newInstance(name);
|
||||
obj.parseLine(parts);
|
||||
return obj;
|
||||
return (ROD)c.newInstance(name);
|
||||
} catch ( java.lang.InstantiationException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
throw new RuntimeException(e);
|
||||
} catch ( java.lang.IllegalAccessException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
throw new RuntimeException(e);
|
||||
} catch ( java.lang.NoSuchMethodException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
throw new RuntimeException(e);
|
||||
} catch ( InvocationTargetException e ) {
|
||||
System.out.println(e);
|
||||
return null; // wow, unsafe!
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private Object initializeROD(final String name, final File file, final Class<ROD> type) {
|
||||
ROD rod = newROD(name, type);
|
||||
try {
|
||||
return rod.initialize(file);
|
||||
} catch ( FileNotFoundException e ) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private ROD parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
ROD obj = newROD(name, type);
|
||||
try {
|
||||
if ( ! obj.parseLine(header, parts) )
|
||||
obj = null;
|
||||
} catch (IOException e) {
|
||||
throw new RuntimeException("Badly formed ROD: " + e);
|
||||
}
|
||||
return obj;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,6 +2,10 @@ package org.broadinstitute.sting.gatk.refdata;
|
|||
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -9,27 +13,28 @@ import org.broadinstitute.sting.utils.GenomeLoc;
|
|||
* Time: 10:49:47 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class ReferenceOrderedDatum implements Comparable<ReferenceOrderedDatum> {
|
||||
protected String name;
|
||||
public interface ReferenceOrderedDatum extends Comparable<ReferenceOrderedDatum> {
|
||||
public String getName();
|
||||
public boolean parseLine(final Object header, final String[] parts) throws IOException;
|
||||
public String toString();
|
||||
public String toSimpleString();
|
||||
public String repl();
|
||||
|
||||
public ReferenceOrderedDatum(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
/**
|
||||
* Used by the ROD system to determine how to split input lines
|
||||
* @return Regex string delimiter separating fields
|
||||
*/
|
||||
public String delimiter();
|
||||
|
||||
public String getName() { return this.name; }
|
||||
public GenomeLoc getLocation();
|
||||
public int compareTo( ReferenceOrderedDatum that );
|
||||
|
||||
public abstract void parseLine(final String[] parts);
|
||||
|
||||
public abstract String toString();
|
||||
public abstract String toSimpleString();
|
||||
public abstract String repl();
|
||||
|
||||
public abstract GenomeLoc getLocation();
|
||||
public int compareTo( ReferenceOrderedDatum that ) {
|
||||
return getLocation().compareTo(that.getLocation());
|
||||
}
|
||||
|
||||
public final String getContig() { return getLocation().getContig(); }
|
||||
public final long getStart() { return getLocation().getStart(); }
|
||||
public final long getStop() { return getLocation().getStop(); }
|
||||
/**
|
||||
* Backdoor hook to read header, meta-data, etc. associated with the file. Will be
|
||||
* called by the ROD system before streaming starts
|
||||
*
|
||||
* @param source source data file on disk from which this rod stream will be pulled
|
||||
* @return a header object that will be passed to parseLine command
|
||||
*/
|
||||
public Object initialize(final File source) throws FileNotFoundException;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,171 @@
|
|||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.regex.MatchResult;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.regex.Matcher;
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
import org.broadinstitute.sting.utils.Utils;
|
||||
import org.broadinstitute.sting.utils.xReadLines;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
/**
|
||||
* Class for representing arbitrary reference ordered data sets
|
||||
*
|
||||
* User: mdepristo
|
||||
* Date: Feb 27, 2009
|
||||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class TabularROD extends BasicReferenceOrderedDatum implements Map<String, String> {
|
||||
private static Logger logger = Logger.getLogger(TabularROD.class);
|
||||
|
||||
private GenomeLoc loc;
|
||||
private HashMap<String, String> attributes;
|
||||
private ArrayList<String> header;
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Constructors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public TabularROD(final String name) {
|
||||
super(name);
|
||||
attributes = new HashMap<String, String>();
|
||||
}
|
||||
|
||||
/**
|
||||
* Walks through the source files looking for the header line, which it returns as a
|
||||
* list of strings.
|
||||
*
|
||||
* @param source
|
||||
* @return
|
||||
*/
|
||||
public Object initialize(final File source) throws FileNotFoundException {
|
||||
List<String> header = null;
|
||||
int linesLookedAt = 0;
|
||||
xReadLines reader = new xReadLines(source);
|
||||
|
||||
for ( String line : reader ) {
|
||||
Matcher m = HEADER_PATTERN.matcher(line);
|
||||
if ( m.matches() ) {
|
||||
//System.out.printf("Found a header line: %s%n", line);
|
||||
header = new ArrayList<String>(Arrays.asList(line.split("\\s+")));
|
||||
}
|
||||
|
||||
if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER )
|
||||
break;
|
||||
}
|
||||
|
||||
// check that we found the header
|
||||
if ( header != null ) {
|
||||
logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header)));
|
||||
} else {
|
||||
throw new RuntimeException("Couldn't find header line in TabularROD!");
|
||||
}
|
||||
|
||||
//System.exit(1);
|
||||
return header;
|
||||
}
|
||||
|
||||
private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000;
|
||||
private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*");
|
||||
private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*");
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Accessors
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public GenomeLoc getLocation() {
|
||||
return loc;
|
||||
}
|
||||
|
||||
public String get(final Object key) {
|
||||
return attributes.get(key);
|
||||
}
|
||||
|
||||
public String put(final String key, final String object) {
|
||||
return attributes.put(key, object);
|
||||
}
|
||||
|
||||
public boolean containsKey(final Object key) {
|
||||
return attributes.containsKey(key);
|
||||
}
|
||||
|
||||
public HashMap<String,String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public String getAttributeString() {
|
||||
List<String> strings = new ArrayList<String>(attributes.size());
|
||||
for ( String key : header ) {
|
||||
if ( containsKey(key) ) { // avoid the header
|
||||
strings.add(this.get(key));
|
||||
System.out.printf("Adding %s%n", this.get(key));
|
||||
}
|
||||
}
|
||||
return Utils.join("\t", strings);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// map functions
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public int size() { return attributes.size(); }
|
||||
public boolean isEmpty() { return attributes.isEmpty(); }
|
||||
public boolean containsValue(Object o) { return attributes.containsValue(o); }
|
||||
public String remove(Object o) { return attributes.remove(o); }
|
||||
public void clear() { attributes.clear(); }
|
||||
public java.util.Set<String> keySet() { return attributes.keySet(); }
|
||||
public java.util.Collection<String> values() { return attributes.values(); }
|
||||
|
||||
public void putAll(java.util.Map<? extends String, ? extends String> map) {
|
||||
attributes.putAll(map);
|
||||
}
|
||||
|
||||
public java.util.Set<java.util.Map.Entry<String,String>> entrySet() {
|
||||
return attributes.entrySet();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// formatting
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%s", getLocation(), getAttributeString());
|
||||
}
|
||||
|
||||
public String delimiter() {
|
||||
return "\\s+";
|
||||
}
|
||||
|
||||
public boolean parseLine(final Object headerObj, final String[] parts) throws IOException {
|
||||
header = (ArrayList<String>)(headerObj);
|
||||
|
||||
//System.out.printf("parts [len=%d] is '%s'%n", parts.length, Utils.join(":", parts));
|
||||
|
||||
if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() )
|
||||
return false;
|
||||
|
||||
if (header.size() != parts.length) {
|
||||
throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length));
|
||||
}
|
||||
|
||||
loc = GenomeLoc.parseGenomeLoc(parts[0]);
|
||||
for ( int i = 1; i < parts.length; i++ ) {
|
||||
put(header.get(i), parts[i]);
|
||||
}
|
||||
|
||||
//System.out.printf("Parsed %s%n", this);
|
||||
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
@ -19,7 +19,7 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant;
|
|||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant {
|
||||
public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVariant {
|
||||
public GenomeLoc loc; // genome location of SNP
|
||||
// Reference sequence chromosome or scaffold
|
||||
// Start and stop positions in chrom
|
||||
|
|
@ -128,7 +128,8 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant {
|
|||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
getContig(), getStart(), getStop(), name, strand, refBases, observed, molType,
|
||||
getLocation().getContig(), getLocation().getStart(), getLocation().getStop(),
|
||||
name, strand, refBases, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
||||
}
|
||||
|
||||
|
|
@ -147,11 +148,12 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant {
|
|||
|
||||
public String repl() {
|
||||
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType,
|
||||
585, getLocation().getContig(), getLocation().getStart()-1, getLocation().getStop()-1,
|
||||
name, strand, refBases, refBases, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
public boolean parseLine(final Object header, final String[] parts) {
|
||||
try {
|
||||
String contig = parts[1];
|
||||
long start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||
|
|
@ -170,6 +172,7 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant {
|
|||
func = parts[15];
|
||||
locType = parts[16];
|
||||
weight = Integer.parseInt(parts[17]);
|
||||
return true;
|
||||
} catch ( RuntimeException e ) {
|
||||
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
||||
throw e;
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import org.broadinstitute.sting.utils.Utils;
|
|||
* Time: 10:47:14 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodGFF extends ReferenceOrderedDatum {
|
||||
public class rodGFF extends BasicReferenceOrderedDatum {
|
||||
private String contig, source, feature, strand, frame;
|
||||
private long start, stop;
|
||||
private double score;
|
||||
|
|
@ -134,7 +134,7 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
return attributes;
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
public boolean parseLine(final Object header, final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts));
|
||||
|
||||
final String contig = parts[0];
|
||||
|
|
@ -152,5 +152,6 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
final String attributeParts = Utils.join(" ", parts, 8, parts.length);
|
||||
HashMap<String, String> attributes = parseAttributes(attributeParts);
|
||||
setValues(contig, source, feature, start, stop, score, strand, frame, attributes);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ import org.broadinstitute.sting.utils.Pileup;
|
|||
* Time: 2:58:33 PM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pileup {
|
||||
public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup {
|
||||
private static final int NO_VARIANT = -1;
|
||||
private static final int SNP_VARIANT = 0;
|
||||
private static final int INSERTION_VARIANT = 1;
|
||||
|
|
@ -69,7 +69,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil
|
|||
}
|
||||
|
||||
@Override
|
||||
public void parseLine(final String[] parts) {
|
||||
public boolean parseLine(final Object header, final String[] parts) {
|
||||
// 0 1 2 3 4 5 6 7
|
||||
// * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow)
|
||||
// * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6
|
||||
|
|
@ -134,6 +134,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil
|
|||
System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts));
|
||||
throw e;
|
||||
}
|
||||
return true;
|
||||
// if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString());
|
||||
}
|
||||
|
||||
|
|
@ -319,7 +320,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil
|
|||
// ----------------------------------------------------------------------
|
||||
public String toString() {
|
||||
return String.format("%s\t%d\t%d\t%s\t%s\t%s",
|
||||
getContig(), getStart(), getStop(), name, refBases, observedString);
|
||||
getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString);
|
||||
}
|
||||
|
||||
public String toSimpleString() {
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ public class DupUtils {
|
|||
copy.setInferredInsertSize(read.getInferredInsertSize());
|
||||
copy.setMappingQuality(read.getMappingQuality());
|
||||
copy.setCigar(read.getCigar());
|
||||
copy.setFlags(copy.getFlags());
|
||||
copy.setFlags(read.getFlags());
|
||||
|
||||
return copy;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -42,7 +42,8 @@ public abstract class BaseTest {
|
|||
|
||||
protected static String seqLocation = "/seq";
|
||||
protected static String oneKGLocation = "/broad/1KG";
|
||||
|
||||
protected static String testDir = "testdata/";
|
||||
|
||||
protected static boolean alreadySetup = false;
|
||||
private static long startTime;
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,121 @@
|
|||
// our package
|
||||
package org.broadinstitute.sting.gatk.refdata;
|
||||
|
||||
|
||||
// the imports for unit testing.
|
||||
|
||||
import org.junit.*;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import org.broadinstitute.sting.BaseTest;
|
||||
import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2;
|
||||
import org.broadinstitute.sting.utils.RefHanger;
|
||||
import org.broadinstitute.sting.utils.GenomeLoc;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Basic unit test for TabularROD
|
||||
*
|
||||
*/
|
||||
public class TabularRODTest extends BaseTest {
|
||||
private static FastaSequenceFile2 seq;
|
||||
private ReferenceOrderedData ROD;
|
||||
private ReferenceOrderedData.RODIterator iter;
|
||||
private ReferenceOrderedData ROD2;
|
||||
private ReferenceOrderedData.RODIterator iter2;
|
||||
|
||||
@BeforeClass
|
||||
public static void init() {
|
||||
// sequence
|
||||
seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"));
|
||||
GenomeLoc.setupRefContigOrdering(seq);
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setupTabularROD() {
|
||||
File file = new File(testDir + "TabularDataTest.dat");
|
||||
ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class);
|
||||
iter = ROD.iterator();
|
||||
|
||||
File file2 = new File(testDir + "TabularDataTest2.dat");
|
||||
ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class);
|
||||
iter2 = ROD2.iterator();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test1() {
|
||||
logger.warn("Executing test1");
|
||||
TabularROD one = (TabularROD)iter.next();
|
||||
assertTrue(one.size() == 3);
|
||||
assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 10)));
|
||||
assertTrue(one.get("COL1").equals("A"));
|
||||
assertTrue(one.get("COL2").equals("B"));
|
||||
assertTrue(one.get("COL3").equals("C"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2() {
|
||||
logger.warn("Executing test2");
|
||||
TabularROD one = (TabularROD)iter.next();
|
||||
TabularROD two = (TabularROD)iter.next();
|
||||
assertTrue(two.size() == 3);
|
||||
assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20)));
|
||||
assertTrue(two.get("COL1").equals("C"));
|
||||
assertTrue(two.get("COL2").equals("D"));
|
||||
assertTrue(two.get("COL3").equals("E"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test3() {
|
||||
logger.warn("Executing test3");
|
||||
TabularROD one = (TabularROD)iter.next();
|
||||
TabularROD two = (TabularROD)iter.next();
|
||||
TabularROD three = (TabularROD)iter.next();
|
||||
assertTrue(three.size() == 3);
|
||||
assertTrue(three.getLocation().equals(new GenomeLoc("chrM", 30)));
|
||||
assertTrue(three.get("COL1").equals("F"));
|
||||
assertTrue(three.get("COL2").equals("G"));
|
||||
assertTrue(three.get("COL3").equals("H"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDone() {
|
||||
logger.warn("Executing testDone");
|
||||
TabularROD one = (TabularROD)iter.next();
|
||||
TabularROD two = (TabularROD)iter.next();
|
||||
TabularROD three = (TabularROD)iter.next();
|
||||
assertTrue(!iter.hasNext());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeek() {
|
||||
logger.warn("Executing testSeek");
|
||||
TabularROD two = (TabularROD)iter.seekForward(new GenomeLoc("chrM", 20));
|
||||
assertTrue(two.size() == 3);
|
||||
assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20)));
|
||||
assertTrue(two.get("COL1").equals("C"));
|
||||
assertTrue(two.get("COL2").equals("D"));
|
||||
assertTrue(two.get("COL3").equals("E"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testToString() {
|
||||
logger.warn("Executing testToString");
|
||||
TabularROD one = (TabularROD)iter.next();
|
||||
assertTrue(one.toString().equals("chrM:10\tA\tB\tC"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void test2p1() {
|
||||
logger.warn("Executing test2p1");
|
||||
TabularROD one2 = (TabularROD)iter2.next();
|
||||
assertTrue(one2.size() == 4);
|
||||
assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10)));
|
||||
assertTrue(one2.get("COL1").equals("A"));
|
||||
assertTrue(one2.get("COL2").equals("B"));
|
||||
assertTrue(one2.get("COL3").equals("C"));
|
||||
assertTrue(one2.get("COL4").equals("1"));
|
||||
}
|
||||
}
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import farm_commands
|
||||
import os.path
|
||||
import sys
|
||||
from optparse import OptionParser
|
||||
import string
|
||||
import re
|
||||
import glob
|
||||
|
||||
#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"]
|
||||
#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637']
|
||||
|
||||
lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"]
|
||||
idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"]
|
||||
ids = dict(zip(lanes, idsList))
|
||||
gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar"
|
||||
ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta"
|
||||
analysis = "CombineDuplicates"
|
||||
|
||||
def getPicardPath(lane, picardRoot = '/seq/picard/'):
|
||||
flowcell, laneNo = lane.split('.')
|
||||
filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*')
|
||||
dirs = glob.glob(filePat)
|
||||
print dirs
|
||||
if len(dirs) > 1:
|
||||
system.exit("Bad lane -- too many directories matching pattern " + filePat)
|
||||
return dirs[0]
|
||||
|
||||
def getReferenceGenotypeFileFromConcordanceFile(concordFile):
|
||||
# REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli
|
||||
p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)')
|
||||
for line in open(concordFile):
|
||||
match = p.search(line)
|
||||
print 'Match is', line, match
|
||||
if match <> None:
|
||||
return match.group(1)
|
||||
return None
|
||||
|
||||
def concord(options, geli, output, genotypeFile):
|
||||
return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) )
|
||||
|
||||
def readPicardConcordance(file):
|
||||
p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE')
|
||||
# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE
|
||||
# HOMOZYGOUS_REFERENCE 853 853 0 1
|
||||
# HETEROZYGOUS 416 413 3 0.992788
|
||||
# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979
|
||||
types = [str, int, int, int, float]
|
||||
def parse1(line):
|
||||
return [f(x) for f, x in zip(types, line.split())]
|
||||
data = [parse1(line) for line in open(file) if p.match(line) <> None]
|
||||
return data
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
# comment
|
||||
# comment line
|
||||
#
|
||||
HEADER COL1 COL2 COL3
|
||||
chrM:10 A B C
|
||||
chrM:20 C D E
|
||||
chrM:30 F G H
|
||||
|
|
@ -0,0 +1,7 @@
|
|||
# comment
|
||||
# comment line
|
||||
#
|
||||
HEADER COL1 COL2 COL3 COL4
|
||||
chrM:10 A B C 1
|
||||
chrM:20 C D E 2
|
||||
chrM:30 F G H 3
|
||||
Loading…
Reference in New Issue