Continuing to expand support for Reference Ordered Data (ROD). These edits add a program, PrepareROD, that sorts ROD files and prepares them for consumption by the analysis toolkit (ATK).

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@17 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2009-03-01 23:32:23 +00:00
parent fd447d7c1d
commit b19e4b502b
5 changed files with 233 additions and 24 deletions

View File

@ -0,0 +1,92 @@
package edu.mit.broad.sting.atk;
import edu.mit.broad.sam.SAMFileReader.ValidationStringency;
import edu.mit.broad.sam.SAMSequenceRecord;
import edu.mit.broad.picard.cmdline.CommandLineProgram;
import edu.mit.broad.picard.cmdline.Usage;
import edu.mit.broad.picard.cmdline.Option;
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
import edu.mit.broad.picard.reference.ReferenceSequence;
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
import edu.mit.broad.sting.atk.modules.*;
import edu.mit.broad.sting.utils.*;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.ArrayList;
/**
 * Command-line program that reads a Referenced Ordered Data (ROD) file, sorts its
 * records in memory according to the contig order of a reference sequence
 * dictionary, writes the sorted records to an output file, and finally re-validates
 * the ordering of the file it wrote.
 */
public class PrepareROD extends CommandLineProgram {
    // Usage and parameters
    @Usage(programVersion="0.1") public String USAGE = "Sorts a Referenced Ordered Data (ROD) file into reference order and validates the result\n";
    @Option(shortName="REF", doc="Reference sequence file") public File REF_FILE_ARG = null;
    @Option(shortName="ROD", doc="Referenced Ordered Data file") public String ROD_FILE = null;
    @Option(shortName="OUT", doc="Output file for the sorted Referenced Ordered Data") public String OUTPUT_FILE = null;
    @Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null;
    @Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null;

    /** Registry of supported ROD types, keyed by lower-cased type name. */
    public static HashMap<String, Class> Types = new HashMap<String,Class>();

    /**
     * Registers a ROD implementation under a case-insensitive name.
     *
     * @param name    type name as given to the RODTYPE option (stored lower-cased)
     * @param rodType class implementing that ROD type
     */
    public static void addModule(final String name, final Class rodType) {
        System.out.printf("* Adding rod class %s%n", name);
        Types.put(name.toLowerCase(), rodType);
    }

    static {
        addModule("GFF", rodGFF.class);
        addModule("dbSNP", rodDbSNP.class);
    }

    /** Required main method implementation. */
    public static void main(String[] argv) {
        System.exit(new PrepareROD().instanceMain(argv));
    }

    /**
     * Sorts the input ROD file into reference order and writes it to the output file.
     *
     * @return 0 on success; 1 if the ROD type is unknown, the output could not be
     *         written, or the written output fails validation
     */
    protected int doWork() {
        // Resolve the ROD type first so that a typo fails immediately with a clear message
        // instead of handing a null class to ReferenceOrderedData.
        final Class rodClass = ROD_TYPE == null ? null : Types.get(ROD_TYPE.toLowerCase());
        if ( rodClass == null ) {
            System.out.printf("Unknown ROD type %s; supported types are %s%n", ROD_TYPE, Types.keySet());
            return 1;
        }

        // Prepare the sort ordering w.r.t. the sequence dictionary
        final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
        final List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
        final HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
        int i = 0;
        for ( SAMSequenceRecord contig : refContigs ) {
            System.out.println(contig.getSequenceName());
            refContigOrdering.put(contig.getSequenceName(), i);
            i++;
        }
        ReferenceOrderedDatum.setContigOrdering(refContigOrdering);

        // The input is not required to be sorted already (sorting it is the point of this
        // program), so a validation failure here is reported but is not fatal.
        final ReferenceOrderedData rod = new ReferenceOrderedData(new File(ROD_FILE), rodClass );
        try {
            rod.validateFile();
        } catch ( Exception e ) {
            e.printStackTrace();
        }

        final ArrayList<ReferenceOrderedDatum> rodData = rod.readAll();
        System.out.printf("Read %d elements from %s%n", rodData.size(), ROD_FILE);
        ReferenceOrderedData.sortRODDataInMemory(rodData);

        try {
            ReferenceOrderedData.write(rodData, new File(OUTPUT_FILE));
        } catch ( IOException e ) {
            // Without a complete output file there is nothing meaningful left to validate.
            e.printStackTrace();
            return 1;
        }

        // The format has a single %s, so only the file name may be passed as an argument.
        System.out.printf("Validating output file %s%n", OUTPUT_FILE);
        final ReferenceOrderedData outputRod = new ReferenceOrderedData(new File(OUTPUT_FILE), rodClass );
        try {
            outputRod.validateFile();
            //outputRod.hasSameContents(ROD_FILE);
        } catch ( Exception e ) {
            // The file we just wrote must be in order; report failure through the exit code.
            e.printStackTrace();
            return 1;
        }

        return 0;
    }
}

View File

@ -1,7 +1,13 @@
package edu.mit.broad.sting.utils;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.ArrayList;
import java.util.Collections;
import edu.mit.broad.picard.util.TabbedTextFileParser;
/**
@ -41,6 +47,50 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
System.exit(1);
}
// ----------------------------------------------------------------------
//
// Manipulations of all of the data
//
// ----------------------------------------------------------------------
/**
 * Reads every record produced by iterating over this object into memory.
 *
 * @return a new list holding the records in iteration order, with its capacity
 *         trimmed to the number of records read
 */
public ArrayList<ReferenceOrderedDatum> readAll() {
    final ArrayList<ReferenceOrderedDatum> records = new ArrayList<ReferenceOrderedDatum>();

    for ( ReferenceOrderedDatum datum : this ) {
        records.add(datum);
    }

    // Release the spare capacity left over from incremental growth.
    records.trimToSize();
    return records;
}
/**
 * Sorts the given records in place by delegating to Collections.sort, i.e. using
 * the ordering defined by the records' own compareTo.
 *
 * @param data records to sort; the list itself is reordered
 */
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
Collections.sort(data);
}
/**
 * Writes the records to a file, one per line, using each record's repl() text
 * followed by a newline character.
 *
 * @param data   records to write, in the order they should appear in the file
 * @param output destination file
 * @throws IOException if the file cannot be opened, written, or closed
 */
public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
    final FileWriter out = new FileWriter(output);
    try {
        for ( ReferenceOrderedDatum rec : data ) {
            out.write(rec.repl() + "\n");
        }
    } finally {
        // Always release the file handle, even when a write (or repl()) fails part-way.
        out.close();
    }
}
/**
 * Checks that the records produced by iterating over this object are in
 * non-decreasing order according to their compareTo.
 *
 * @return true when every adjacent pair of records is in order (also for zero or one record)
 * @throws Exception if a record sorts before its predecessor; the message contains both records
 */
public boolean validateFile() throws Exception {
    ReferenceOrderedDatum last = null;
    for ( ReferenceOrderedDatum rec : this ) {
        // compareTo only promises the sign of its result, so test "> 0" rather than "== 1";
        // otherwise a result such as 2 would let an out-of-order pair slip through.
        if ( last != null && last.compareTo(rec) > 0 ) {
            // It's out of order
            throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
        }
        last = rec;
    }
    return true;
}
/**
 * Placeholder for indexing the file; the body is empty, so calling it currently does nothing.
 */
public void indexFile() {
// Fixme -- get access to the linear index system from Jim
}
// ----------------------------------------------------------------------
//
// Iteration

View File

@ -1,5 +1,11 @@
package edu.mit.broad.sting.utils;
import java.util.Comparator;
import java.util.HashMap;
//
// Ugly global variable defining the optional ordering of contig elements
//
/**
* Created by IntelliJ IDEA.
* User: mdepristo
@ -7,15 +13,62 @@ package edu.mit.broad.sting.utils;
* Time: 10:49:47 AM
* To change this template use File | Settings | File Templates.
*/
public abstract class ReferenceOrderedDatum {
public abstract class ReferenceOrderedDatum implements Comparable {
    /**
     * Optional global contig ordering: maps a contig name to its rank. When null,
     * contigs are ordered lexicographically by name.
     */
    public static HashMap<String, Integer> refContigOrdering = null;

    /**
     * Installs the global contig ordering used by {@link #compareTo(Object)}.
     *
     * @param rco map from contig name to rank, or null to fall back to lexicographic order
     */
    public static void setContigOrdering(HashMap<String, Integer> rco) {
        refContigOrdering = rco;
    }

    public ReferenceOrderedDatum() { }

    /** Populates this record from the already-split fields of one input line. */
    public abstract void parseLine(final String[] parts);

    public abstract String toString();

    public abstract String toSimpleString();

    /** Returns the text form of this record that is written, one record per line, to an output file. */
    public abstract String repl();

    /** Returns the name of the contig this record lies on. */
    public abstract String getContig();

    /** Returns the start position used for ordering records on the same contig. */
    public abstract long getStart();

    /** Returns the stop position used to break ties between records with equal starts. */
    public abstract long getStop();

    /**
     * Orders records by contig, then start, then stop.
     *
     * Contigs are ranked by the global ordering when one is installed: contigs present
     * in the ordering sort by rank and come before contigs absent from it; contigs that
     * are both absent (or all contigs when no ordering is installed) sort by name.
     *
     * @param x the record to compare against; must be a ReferenceOrderedDatum
     * @return exactly -1, 0 or 1
     * @throws ClassCastException if x is not a ReferenceOrderedDatum
     */
    public int compareTo( Object x ) {
        if ( this == x ) return 0;

        final ReferenceOrderedDatum that = (ReferenceOrderedDatum)x;

        final int cmpContig = compareContigs(this.getContig(), that.getContig());
        if ( cmpContig != 0 ) return cmpContig;

        if ( this.getStart() < that.getStart() ) return -1;
        if ( this.getStart() > that.getStart() ) return 1;
        if ( this.getStop() < that.getStop() ) return -1;
        if ( this.getStop() > that.getStop() ) return 1;
        return 0;
    }

    /** Compares two contig names under the current global ordering; returns -1, 0 or 1. */
    private static int compareContigs(final String thisContig, final String thatContig) {
        if ( refContigOrdering != null ) {
            final boolean thisKnown = refContigOrdering.containsKey(thisContig);
            final boolean thatKnown = refContigOrdering.containsKey(thatContig);

            if ( thisKnown && thatKnown ) {
                final int thisO = refContigOrdering.get(thisContig);
                final int thatO = refContigOrdering.get(thatContig);
                if ( thisO < thatO ) return -1;
                if ( thisO > thatO ) return 1;
                return 0;
            }

            // A contig missing from the ordering always sorts after one that is present.
            if ( thisKnown ) return -1;
            if ( thatKnown ) return 1;
        }

        // Use regular sorted order. String.compareTo may return any magnitude, so reduce it
        // to its sign to keep every path of compareTo returning exactly -1, 0 or 1.
        return Integer.signum(thisContig.compareTo(thatContig));
    }
}

View File

@ -72,31 +72,41 @@ public class rodDbSNP extends ReferenceOrderedDatum {
return String.format("%s:%s", name, observed);
}
/**
 * Formats this record as one tab-separated line laid out the way parseLine reads it:
 * contig, start and stop in columns 1-3 (converted back to 0-based by subtracting 1),
 * name in column 4, strand in column 6, and observed through weight in columns 9-17.
 * Columns that parseLine does not retain are written as fixed placeholders
 * (585, 0, X, X).
 *
 * @return the tab-separated line, without a trailing newline
 */
public String repl() {
    // Pin the locale: under a default locale with a decimal comma, %f would emit text
    // that Double.parseDouble in parseLine cannot read back.
    return String.format(java.util.Locale.US,
            "%d\t%s\t%d\t%d\t%s\t0\t%s\tX\tX\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
            585, contig, start-1, stop-1, name, strand, observed, molType,
            varType, validationStatus, avHet, avHetSE, func, locType, weight );
}
public void parseLine(final String[] parts) {
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts));
contig = parts[1];
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
name = parts[4];
strand = parts[5];
observed = parts[9];
molType = parts[10];
varType = parts[11];
validationStatus = parts[12];
avHet = Double.parseDouble(parts[13]);
avHetSE = Double.parseDouble(parts[14]);
func = parts[15];
locType = parts[16];
weight = Integer.parseInt(parts[17]);
try {
contig = parts[1];
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
// Cut up the observed bases string into an array of individual bases
String[] bases = observed.split("/");
observedBases = new char[bases.length];
for ( String elt : bases ) {
observedBases[0] = (char)elt.getBytes()[0];
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
name = parts[4];
strand = parts[6];
observed = parts[9];
molType = parts[10];
varType = parts[11];
validationStatus = parts[12];
avHet = Double.parseDouble(parts[13]);
avHetSE = Double.parseDouble(parts[14]);
func = parts[15];
locType = parts[16];
weight = Integer.parseInt(parts[17]);
// Cut up the observed bases string into an array of individual bases
String[] bases = observed.split("/");
observedBases = new char[bases.length];
for ( String elt : bases ) {
observedBases[0] = (char)elt.getBytes()[0];
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
}
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
} catch ( RuntimeException e ) {
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
throw e;
}
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
}
}

View File

@ -98,6 +98,10 @@ public class rodGFF extends ReferenceOrderedDatum {
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
}
/**
 * Returns the text form of this record used when writing it out; for this type
 * it is simply whatever toString() produces.
 */
public String repl() {
return this.toString();
}
/**
 * Returns a short description of this record consisting only of its feature
 * field, rendered through the "%s" format.
 */
public String toSimpleString() {
return String.format("%s", feature);
}