Continuing to expand support for ROD. These edits add a program, PrepareROD, that sorts and prepares RODs for consumption by the analysis TK.
git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@17 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
parent
fd447d7c1d
commit
b19e4b502b
|
|
@ -0,0 +1,92 @@
|
|||
package edu.mit.broad.sting.atk;
|
||||
|
||||
import edu.mit.broad.sam.SAMFileReader.ValidationStringency;
|
||||
import edu.mit.broad.sam.SAMSequenceRecord;
|
||||
import edu.mit.broad.picard.cmdline.CommandLineProgram;
|
||||
import edu.mit.broad.picard.cmdline.Usage;
|
||||
import edu.mit.broad.picard.cmdline.Option;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequence;
|
||||
import edu.mit.broad.picard.reference.ReferenceSequenceFile;
|
||||
|
||||
import edu.mit.broad.sting.atk.modules.*;
|
||||
import edu.mit.broad.sting.utils.*;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
|
||||
public class PrepareROD extends CommandLineProgram {
|
||||
// Usage and parameters
|
||||
@Usage(programVersion="0.1") public String USAGE = "SAM Validator\n";
|
||||
@Option(shortName="REF", doc="Reference sequence file") public File REF_FILE_ARG = null;
|
||||
@Option(shortName="ROD", doc="Referenced Ordered Data file") public String ROD_FILE = null;
|
||||
@Option(shortName="OUT", doc="Referenced Ordered Data file") public String OUTPUT_FILE = null;
|
||||
@Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null;
|
||||
@Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null;
|
||||
|
||||
public static HashMap<String, Class> Types = new HashMap<String,Class>();
|
||||
public static void addModule(final String name, final Class rodType) {
|
||||
System.out.printf("* Adding rod class %s%n", name);
|
||||
Types.put(name.toLowerCase(), rodType);
|
||||
}
|
||||
|
||||
static {
|
||||
addModule("GFF", rodGFF.class);
|
||||
addModule("dbSNP", rodDbSNP.class);
|
||||
}
|
||||
|
||||
/** Required main method implementation. */
|
||||
public static void main(String[] argv) {
|
||||
System.exit(new PrepareROD().instanceMain(argv));
|
||||
}
|
||||
|
||||
protected int doWork() {
|
||||
|
||||
// Prepare the sort ordering w.r.t. the sequence dictionary
|
||||
final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG);
|
||||
List<SAMSequenceRecord> refContigs = refFile.getSequenceDictionary();
|
||||
HashMap<String, Integer> refContigOrdering = new HashMap<String, Integer>();
|
||||
ReferenceOrderedDatum.setContigOrdering(refContigOrdering);
|
||||
|
||||
int i = 0;
|
||||
for ( SAMSequenceRecord contig : refContigs ) {
|
||||
System.out.println(contig.getSequenceName());
|
||||
refContigOrdering.put(contig.getSequenceName(), i);
|
||||
i++;
|
||||
}
|
||||
|
||||
Class rodClass = Types.get(ROD_TYPE.toLowerCase());
|
||||
|
||||
ReferenceOrderedData rod = new ReferenceOrderedData(new File(ROD_FILE), rodClass );
|
||||
try {
|
||||
rod.validateFile();
|
||||
} catch ( Exception e ) {
|
||||
//System.out.println("Validation failure: " + e);
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
ArrayList<ReferenceOrderedDatum> rodData = rod.readAll();
|
||||
System.out.printf("Read %d elements from %s%n", rodData.size(), ROD_FILE);
|
||||
ReferenceOrderedData.sortRODDataInMemory(rodData);
|
||||
try {
|
||||
ReferenceOrderedData.write(rodData, new File(OUTPUT_FILE));
|
||||
} catch ( IOException e ) {
|
||||
//System.out.println("Validation failure: " + e);
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
System.out.printf("Validating output file %s%n", rodData.size(), OUTPUT_FILE);
|
||||
ReferenceOrderedData outputRod = new ReferenceOrderedData(new File(OUTPUT_FILE), rodClass );
|
||||
try {
|
||||
outputRod.validateFile();
|
||||
//outputRod.hasSameContents(ROD_FILE);
|
||||
} catch ( Exception e ) {
|
||||
//System.out.println("Validation failure: " + e);
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
@ -1,7 +1,13 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.Iterator;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
|
||||
import edu.mit.broad.picard.util.TabbedTextFileParser;
|
||||
|
||||
/**
|
||||
|
|
@ -41,6 +47,50 @@ public class ReferenceOrderedData<ROD extends ReferenceOrderedDatum> implements
|
|||
System.exit(1);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Manipulations of all of the data
|
||||
//
|
||||
// ----------------------------------------------------------------------
|
||||
public ArrayList<ReferenceOrderedDatum> readAll() {
|
||||
ArrayList<ReferenceOrderedDatum> elts = new ArrayList<ReferenceOrderedDatum>();
|
||||
for ( ReferenceOrderedDatum rec : this ) {
|
||||
elts.add(rec);
|
||||
}
|
||||
elts.trimToSize();
|
||||
return elts;
|
||||
}
|
||||
|
||||
public static void sortRODDataInMemory(ArrayList<ReferenceOrderedDatum> data) {
|
||||
Collections.sort(data);
|
||||
}
|
||||
|
||||
public static void write(ArrayList<ReferenceOrderedDatum> data, File output) throws IOException {
|
||||
final FileWriter out = new FileWriter(output);
|
||||
|
||||
for ( ReferenceOrderedDatum rec : data ) {
|
||||
out.write(rec.repl() + "\n");
|
||||
}
|
||||
|
||||
out.close();
|
||||
}
|
||||
|
||||
public boolean validateFile() throws Exception {
|
||||
ReferenceOrderedDatum last = null;
|
||||
for ( ReferenceOrderedDatum rec : this ) {
|
||||
if ( last != null && last.compareTo(rec) == 1 ) {
|
||||
// It's out of order
|
||||
throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString());
|
||||
}
|
||||
last = rec;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public void indexFile() {
|
||||
// Fixme -- get access to the linear index system from Jim
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------
|
||||
//
|
||||
// Iteration
|
||||
|
|
|
|||
|
|
@ -1,5 +1,11 @@
|
|||
package edu.mit.broad.sting.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
|
||||
//
|
||||
// Ugly global variable defining the optional ordering of contig elements
|
||||
//
|
||||
/**
|
||||
* Created by IntelliJ IDEA.
|
||||
* User: mdepristo
|
||||
|
|
@ -7,15 +13,62 @@ package edu.mit.broad.sting.utils;
|
|||
* Time: 10:49:47 AM
|
||||
* To change this template use File | Settings | File Templates.
|
||||
*/
|
||||
public abstract class ReferenceOrderedDatum {
|
||||
public abstract class ReferenceOrderedDatum implements Comparable {
|
||||
public static HashMap<String, Integer> refContigOrdering = null;
|
||||
public static void setContigOrdering(HashMap<String, Integer> rco) {
|
||||
refContigOrdering = rco;
|
||||
}
|
||||
|
||||
public ReferenceOrderedDatum() { }
|
||||
|
||||
public abstract void parseLine(final String[] parts);
|
||||
|
||||
public abstract String toString();
|
||||
public abstract String toSimpleString();
|
||||
public abstract String repl();
|
||||
|
||||
public abstract String getContig();
|
||||
public abstract long getStart();
|
||||
public abstract long getStop();
|
||||
|
||||
public int compareTo( Object x ) {
|
||||
if ( this == x ) return 0;
|
||||
|
||||
ReferenceOrderedDatum that = (ReferenceOrderedDatum)x;
|
||||
|
||||
if ( refContigOrdering != null ) {
|
||||
if ( ! refContigOrdering.containsKey(this.getContig()) ) {
|
||||
if ( ! refContigOrdering.containsKey(that.getContig()) ) {
|
||||
// Use regular sorted order
|
||||
int cmpContig = getContig().compareTo(that.getContig());
|
||||
if ( cmpContig != 0 )return cmpContig;
|
||||
}
|
||||
else {
|
||||
// this is always bigger if that is in the key set
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
else if ( ! refContigOrdering.containsKey(that.getContig()) )
|
||||
return -1;
|
||||
else {
|
||||
assert refContigOrdering.containsKey(this.getContig()) : this;
|
||||
assert refContigOrdering.containsKey(that.getContig()) : that;
|
||||
|
||||
final int thisO = refContigOrdering.get(this.getContig());
|
||||
final int thatO = refContigOrdering.get(that.getContig());
|
||||
if ( thisO < thatO ) return -1;
|
||||
if ( thisO > thatO ) return 1;
|
||||
}
|
||||
}
|
||||
else {
|
||||
int cmpContig = getContig().compareTo(that.getContig());
|
||||
if ( cmpContig != 0 )return cmpContig;
|
||||
}
|
||||
|
||||
if ( this.getStart() < that.getStart() ) return -1;
|
||||
if ( this.getStart() > that.getStart() ) return 1;
|
||||
if ( this.getStop() < that.getStop() ) return -1;
|
||||
if ( this.getStop() > that.getStop() ) return 1;
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -72,31 +72,41 @@ public class rodDbSNP extends ReferenceOrderedDatum {
|
|||
return String.format("%s:%s", name, observed);
|
||||
}
|
||||
|
||||
public String repl() {
|
||||
return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\tX\tX\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d",
|
||||
585, contig, start-1, stop-1, name, strand, observed, molType,
|
||||
varType, validationStatus, avHet, avHetSE, func, locType, weight );
|
||||
}
|
||||
|
||||
public void parseLine(final String[] parts) {
|
||||
//System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
||||
contig = parts[1];
|
||||
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||
name = parts[4];
|
||||
strand = parts[5];
|
||||
observed = parts[9];
|
||||
molType = parts[10];
|
||||
varType = parts[11];
|
||||
validationStatus = parts[12];
|
||||
avHet = Double.parseDouble(parts[13]);
|
||||
avHetSE = Double.parseDouble(parts[14]);
|
||||
func = parts[15];
|
||||
locType = parts[16];
|
||||
weight = Integer.parseInt(parts[17]);
|
||||
try {
|
||||
contig = parts[1];
|
||||
start = Long.parseLong(parts[2]) + 1; // The final is 0 based
|
||||
|
||||
// Cut up the observed bases string into an array of individual bases
|
||||
String[] bases = observed.split("/");
|
||||
observedBases = new char[bases.length];
|
||||
for ( String elt : bases ) {
|
||||
observedBases[0] = (char)elt.getBytes()[0];
|
||||
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
|
||||
stop = Long.parseLong(parts[3]) + 1; // The final is 0 based
|
||||
name = parts[4];
|
||||
strand = parts[6];
|
||||
observed = parts[9];
|
||||
molType = parts[10];
|
||||
varType = parts[11];
|
||||
validationStatus = parts[12];
|
||||
avHet = Double.parseDouble(parts[13]);
|
||||
avHetSE = Double.parseDouble(parts[14]);
|
||||
func = parts[15];
|
||||
locType = parts[16];
|
||||
weight = Integer.parseInt(parts[17]);
|
||||
|
||||
// Cut up the observed bases string into an array of individual bases
|
||||
String[] bases = observed.split("/");
|
||||
observedBases = new char[bases.length];
|
||||
for ( String elt : bases ) {
|
||||
observedBases[0] = (char)elt.getBytes()[0];
|
||||
//System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]);
|
||||
}
|
||||
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
|
||||
} catch ( RuntimeException e ) {
|
||||
System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts));
|
||||
throw e;
|
||||
}
|
||||
//System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
@ -98,6 +98,10 @@ public class rodGFF extends ReferenceOrderedDatum {
|
|||
return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame);
|
||||
}
|
||||
|
||||
public String repl() {
|
||||
return this.toString();
|
||||
}
|
||||
|
||||
public String toSimpleString() {
|
||||
return String.format("%s", feature);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue