Continuing to expand support for ROD. These edits add a program, PrepareROD, that sorts and prepares RODs for consumption by the analysis TK.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@17 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fd447d7c1d
commit
b19e4b502b
|
|
@ -0,0 +1,92 @@
|
||||||
|
package edu.mit.broad.sting.atk;
|
||||||
|
|
||||||
|
import edu.mit.broad.sam.SAMFileReader.ValidationStringency;
|
||||||
|
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||||
|
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||||
|
import edu.mit.broad.picard.cmdline.Usage;
|
||||||
|
import edu.mit.broad.picard.cmdline.Option;
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||||
|
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||||
|
|
||||||
|
import edu.mit.broad.sting.atk.modules.*;
|
||||||
|
import edu.mit.broad.sting.utils.*;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
|
||||||
|
public class PrepareROD extends CommandLineProgram {
|
||||||
|
// Usage and parameters
|
||||||
|
@Usage(programVersion="0.1") public String USAGE = "SAM Validator\n";
|
||||||
|
@Option(shortName="REF", doc="Reference sequence file") public File REF_FILE_ARG = null;
|
||||||
|
@Option(shortName="ROD", doc="Referenced Ordered Data file") public String ROD_FILE = null;
|
||||||
|
@Option(shortName="OUT", doc="Referenced Ordered Data file") public String OUTPUT_FILE = null;
|
||||||
|
@Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null;
|
||||||
|
@Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null;
|
||||||
|
|
||||||
|
public static HashMap<String, Class> Types = new HashMap<String,Class>();
|
||||||
|
public static void addModule(final String name, final Class rodType) {
|
||||||
|
System.out.printf("* Adding rod class %s%n", name);
|
||||||
|
Types.put(name.toLowerCase(), rodType);
|
||||||
|
}
|
||||||
|
|
||||||
|
static {
|
||||||
|
addModule("GFF", rodGFF.class);
|
||||||
|
addModule("dbSNP", rodDbSNP.class);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Required main method implementation. */
|
||||||
|
public static void main(String[] argv) {
|
||||||
|
System.exit(new PrepareROD().instanceMain(argv));
|
||||||
|
}
|
||||||
|
|
||||||
|
protected int doWork() {
|
||||||
|
|
||||||
|
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||||
|
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||||
|
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
|
||||||
|
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||||
|
ReferenceOrderedDatum.setContigOrdering(refContigOrdering);
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for ( SAMSequenceRecord contig : refContigs ) {
|
||||||
|
System.out.println(contig.getSequenceName());
|
||||||
|
refContigOrdering.put(contig.getSequenceName(), i);
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
Class rodClass = Types.get(ROD_TYPE.toLowerCase());
|
||||||
|
|
||||||
|
ReferenceOrderedData rod = new ReferenceOrderedData(new File(ROD_FILE), rodClass );
|
||||||
|
try {
|
||||||
|
rod.validateFile();
|
||||||
|
} catch ( Exception e ) {
|
||||||
|
//System.out.println("Validation failure: " + e);
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
ArrayList<ReferenceOrderedDatum> rodData = rod.readAll();
|
||||||
|
System.out.printf("Read %d elements from %s%n", rodData.size(), ROD_FILE);
|
||||||
|
ReferenceOrderedData.sortRODDataInMemory(rodData);
|
||||||
|
try {
|
||||||
|
ReferenceOrderedData.write(rodData, new File(OUTPUT_FILE));
|
||||||
|
} catch ( IOException e ) {
|
||||||
|
//System.out.println("Validation failure: " + e);
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.printf("Validating output file %s%n", rodData.size(), OUTPUT_FILE);
|
||||||
|
ReferenceOrderedData outputRod = new ReferenceOrderedData(new File(OUTPUT_FILE), rodClass );
|
||||||
|
try {
|
||||||
|
outputRod.validateFile();
|
||||||
|
//outputRod.hasSameContents(ROD_FILE);
|
||||||
|
} catch ( Exception e ) {
|
||||||
|
//System.out.println("Validation failure: " + e);
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -1,7 +1,13 @@
|
||||||
package edu.mit.broad.sting.utils;
|
package edu.mit.broad.sting.utils;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.FileWriter;
|
||||||
|
import java.io.IOException;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Collections;
|
||||||
|
|
||||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
@ -41,6 +47,50 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Manipulations of all of the data
|
||||||
|
//
|
||||||
|
// ----------------------------------------------------------------------
|
||||||
|
public ArrayList<ReferenceOrderedDatum> readAll() {
|
||||||
|
ArrayList<ReferenceOrderedDatum> elts = new ArrayList<ReferenceOrderedDatum>();
|
||||||
|
for ( ReferenceOrderedDatum rec : this ) {
|
||||||
|
elts.add(rec);
|
||||||
|
}
|
||||||
|
elts.trimToSize();
|
||||||
|
return elts;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
|
||||||
|
Collections.sort(data);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
|
||||||
|
final FileWriter out = new FileWriter(output);
|
||||||
|
|
||||||
|
for ( ReferenceOrderedDatum rec : data ) {
|
||||||
|
out.write(rec.repl() + "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
out.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean validateFile() throws Exception {
|
||||||
|
ReferenceOrderedDatum last = null;
|
||||||
|
for ( ReferenceOrderedDatum rec : this ) {
|
||||||
|
if ( last != null && last.compareTo(rec) == 1 ) {
|
||||||
|
// It's out of order
|
||||||
|
throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
|
||||||
|
}
|
||||||
|
last = rec;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void indexFile() {
|
||||||
|
// Fixme -- get access to the linear index system from Jim
|
||||||
|
}
|
||||||
|
|
||||||
// ----------------------------------------------------------------------
|
// ----------------------------------------------------------------------
|
||||||
//
|
//
|
||||||
// Iteration
|
// Iteration
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,11 @@
|
||||||
package edu.mit.broad.sting.utils;
|
package edu.mit.broad.sting.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Ugly global variable defining the optional ordering of contig elements
|
||||||
|
//
|
||||||
/**
|
/**
|
||||||
* Created by IntelliJ IDEA.
|
* Created by IntelliJ IDEA.
|
||||||
* User: mdepristo
|
* User: mdepristo
|
||||||
|
|
@ -7,15 +13,62 @@ package edu.mit.broad.sting.utils;
|
||||||
* Time: 10:49:47 AM
|
* Time: 10:49:47 AM
|
||||||
* To change this template use File | Settings | File Templates.
|
* To change this template use File | Settings | File Templates.
|
||||||
*/
|
*/
|
||||||
public abstract class ReferenceOrderedDatum {
|
public abstract class ReferenceOrderedDatum implements Comparable {
|
||||||
|
public static HashMap<String, Integer> refContigOrdering = null;
|
||||||
|
public static void setContigOrdering(HashMap<String, Integer> rco) {
|
||||||
|
refContigOrdering = rco;
|
||||||
|
}
|
||||||
|
|
||||||
public ReferenceOrderedDatum() { }
|
public ReferenceOrderedDatum() { }
|
||||||
|
|
||||||
public abstract void parseLine(final String[] parts);
|
public abstract void parseLine(final String[] parts);
|
||||||
|
|
||||||
public abstract String toString();
|
public abstract String toString();
|
||||||
public abstract String toSimpleString();
|
public abstract String toSimpleString();
|
||||||
|
public abstract String repl();
|
||||||
|
|
||||||
public abstract String getContig();
|
public abstract String getContig();
|
||||||
public abstract long getStart();
|
public abstract long getStart();
|
||||||
public abstract long getStop();
|
public abstract long getStop();
|
||||||
|
|
||||||
|
public int compareTo( Object x ) {
|
||||||
|
if ( this == x ) return 0;
|
||||||
|
|
||||||
|
ReferenceOrderedDatum that = (ReferenceOrderedDatum)x;
|
||||||
|
|
||||||
|
if ( refContigOrdering != null ) {
|
||||||
|
if ( ! refContigOrdering.containsKey(this.getContig()) ) {
|
||||||
|
if ( ! refContigOrdering.containsKey(that.getContig()) ) {
|
||||||
|
// Use regular sorted order
|
||||||
|
int cmpContig = getContig().compareTo(that.getContig());
|
||||||
|
if ( cmpContig != 0 )return cmpContig;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
// this is always bigger if that is in the key set
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else if ( ! refContigOrdering.containsKey(that.getContig()) )
|
||||||
|
return -1;
|
||||||
|
else {
|
||||||
|
assert refContigOrdering.containsKey(this.getContig()) : this;
|
||||||
|
assert refContigOrdering.containsKey(that.getContig()) : that;
|
||||||
|
|
||||||
|
final int thisO = refContigOrdering.get(this.getContig());
|
||||||
|
final int thatO = refContigOrdering.get(that.getContig());
|
||||||
|
if ( thisO < thatO ) return -1;
|
||||||
|
if ( thisO > thatO ) return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
int cmpContig = getContig().compareTo(that.getContig());
|
||||||
|
if ( cmpContig != 0 )return cmpContig;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( this.getStart() < that.getStart() ) return -1;
|
||||||
|
if ( this.getStart() > that.getStart() ) return 1;
|
||||||
|
if ( this.getStop() < that.getStop() ) return -1;
|
||||||
|
if ( this.getStop() > that.getStop() ) return 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -72,31 +72,41 @@ public class rodDbSNP extends ReferenceOrderedDatum {
|
||||||
return String.format("%s:%s", name, observed);
|
return String.format("%s:%s", name, observed);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String repl() {
|
||||||
|
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\tX\tX\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||||
|
585, contig, start-1, stop-1, name, strand, observed, molType,
|
||||||
|
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
||||||
|
}
|
||||||
|
|
||||||
public void parseLine(final String[] parts) {
|
public void parseLine(final String[] parts) {
|
||||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
try {
|
||||||
contig = parts[1];
|
contig = parts[1];
|
||||||
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||||
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
|
||||||
name = parts[4];
|
|
||||||
strand = parts[5];
|
|
||||||
observed = parts[9];
|
|
||||||
molType = parts[10];
|
|
||||||
varType = parts[11];
|
|
||||||
validationStatus = parts[12];
|
|
||||||
avHet = Double.parseDouble(parts[13]);
|
|
||||||
avHetSE = Double.parseDouble(parts[14]);
|
|
||||||
func = parts[15];
|
|
||||||
locType = parts[16];
|
|
||||||
weight = Integer.parseInt(parts[17]);
|
|
||||||
|
|
||||||
// Cut up the observed bases string into an array of individual bases
|
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||||
String[] bases = observed.split("/");
|
name = parts[4];
|
||||||
observedBases = new char[bases.length];
|
strand = parts[6];
|
||||||
for ( String elt : bases ) {
|
observed = parts[9];
|
||||||
observedBases[0] = (char)elt.getBytes()[0];
|
molType = parts[10];
|
||||||
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
|
varType = parts[11];
|
||||||
|
validationStatus = parts[12];
|
||||||
|
avHet = Double.parseDouble(parts[13]);
|
||||||
|
avHetSE = Double.parseDouble(parts[14]);
|
||||||
|
func = parts[15];
|
||||||
|
locType = parts[16];
|
||||||
|
weight = Integer.parseInt(parts[17]);
|
||||||
|
|
||||||
|
// Cut up the observed bases string into an array of individual bases
|
||||||
|
String[] bases = observed.split("/");
|
||||||
|
observedBases = new char[bases.length];
|
||||||
|
for ( String elt : bases ) {
|
||||||
|
observedBases[0] = (char)elt.getBytes()[0];
|
||||||
|
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
|
||||||
|
}
|
||||||
|
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
|
||||||
|
} catch ( RuntimeException e ) {
|
||||||
|
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
||||||
|
throw e;
|
||||||
}
|
}
|
||||||
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -98,6 +98,10 @@ public class rodGFF extends ReferenceOrderedDatum {
|
||||||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String repl() {
|
||||||
|
return this.toString();
|
||||||
|
}
|
||||||
|
|
||||||
public String toSimpleString() {
|
public String toSimpleString() {
|
||||||
return String.format("%s", feature);
|
return String.format("%s", feature);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue