diff --git a/java/src/edu/mit/broad/sting/atk/PrepareROD.java b/java/src/edu/mit/broad/sting/atk/PrepareROD.java new file mode 100644 index 000000000..770aa5ab5 --- /dev/null +++ b/java/src/edu/mit/broad/sting/atk/PrepareROD.java @@ -0,0 +1,92 @@ +package edu.mit.broad.sting.atk; + +import edu.mit.broad.sam.SAMFileReader.ValidationStringency; +import edu.mit.broad.sam.SAMSequenceRecord; +import edu.mit.broad.picard.cmdline.CommandLineProgram; +import edu.mit.broad.picard.cmdline.Usage; +import edu.mit.broad.picard.cmdline.Option; +import edu.mit.broad.picard.reference.ReferenceSequenceFileFactory; +import edu.mit.broad.picard.reference.ReferenceSequence; +import edu.mit.broad.picard.reference.ReferenceSequenceFile; + +import edu.mit.broad.sting.atk.modules.*; +import edu.mit.broad.sting.utils.*; + +import java.io.*; +import java.util.HashMap; +import java.util.List; +import java.util.ArrayList; + +public class PrepareROD extends CommandLineProgram { + // Usage and parameters + @Usage(programVersion="0.1") public String USAGE = "SAM Validator\n"; + @Option(shortName="REF", doc="Reference sequence file") public File REF_FILE_ARG = null; + @Option(shortName="ROD", doc="Referenced Ordered Data file") public String ROD_FILE = null; + @Option(shortName="OUT", doc="Referenced Ordered Data file") public String OUTPUT_FILE = null; + @Option(shortName="RODNAME", doc="Name of the data") public String ROD_NAME = null; + @Option(shortName="RODTYPE", doc="Referenced Ordered Data type") public String ROD_TYPE = null; + + public static HashMap Types = new HashMap(); + public static void addModule(final String name, final Class rodType) { + System.out.printf("* Adding rod class %s%n", name); + Types.put(name.toLowerCase(), rodType); + } + + static { + addModule("GFF", rodGFF.class); + addModule("dbSNP", rodDbSNP.class); + } + + /** Required main method implementation. */ + public static void main(String[] argv) { + System.exit(new PrepareROD().instanceMain(argv)); + } + + protected int doWork() { + + // Prepare the sort ordering w.r.t. the sequence dictionary + final ReferenceSequenceFile refFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(REF_FILE_ARG); + List refContigs = refFile.getSequenceDictionary(); + HashMap refContigOrdering = new HashMap(); + ReferenceOrderedDatum.setContigOrdering(refContigOrdering); + + int i = 0; + for ( SAMSequenceRecord contig : refContigs ) { + System.out.println(contig.getSequenceName()); + refContigOrdering.put(contig.getSequenceName(), i); + i++; + } + + Class rodClass = Types.get(ROD_TYPE.toLowerCase()); + + ReferenceOrderedData rod = new ReferenceOrderedData(new File(ROD_FILE), rodClass ); + try { + rod.validateFile(); + } catch ( Exception e ) { + //System.out.println("Validation failure: " + e); + e.printStackTrace(); + } + + ArrayList rodData = rod.readAll(); + System.out.printf("Read %d elements from %s%n", rodData.size(), ROD_FILE); + ReferenceOrderedData.sortRODDataInMemory(rodData); + try { + ReferenceOrderedData.write(rodData, new File(OUTPUT_FILE)); + } catch ( IOException e ) { + //System.out.println("Validation failure: " + e); + e.printStackTrace(); + } + + System.out.printf("Validating output file %s%n", rodData.size(), OUTPUT_FILE); + ReferenceOrderedData outputRod = new ReferenceOrderedData(new File(OUTPUT_FILE), rodClass ); + try { + outputRod.validateFile(); + //outputRod.hasSameContents(ROD_FILE); + } catch ( Exception e ) { + //System.out.println("Validation failure: " + e); + e.printStackTrace(); + } + + return 0; + } +} \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java index e71d0bd44..7256acd83 100644 --- a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedData.java @@ -1,7 +1,13 @@ package edu.mit.broad.sting.utils; import java.io.File; +import java.io.FileOutputStream; +import java.io.FileWriter; +import java.io.IOException; import java.util.Iterator; +import java.util.ArrayList; +import java.util.Collections; + import edu.mit.broad.picard.util.TabbedTextFileParser; /** @@ -41,6 +47,50 @@ public class ReferenceOrderedData implements System.exit(1); } + // ---------------------------------------------------------------------- + // + // Manipulations of all of the data + // + // ---------------------------------------------------------------------- + public ArrayList readAll() { + ArrayList elts = new ArrayList(); + for ( ReferenceOrderedDatum rec : this ) { + elts.add(rec); + } + elts.trimToSize(); + return elts; + } + + public static void sortRODDataInMemory(ArrayList data) { + Collections.sort(data); + } + + public static void write(ArrayList data, File output) throws IOException { + final FileWriter out = new FileWriter(output); + + for ( ReferenceOrderedDatum rec : data ) { + out.write(rec.repl() + "\n"); + } + + out.close(); + } + + public boolean validateFile() throws Exception { + ReferenceOrderedDatum last = null; + for ( ReferenceOrderedDatum rec : this ) { + if ( last != null && last.compareTo(rec) == 1 ) { + // It's out of order + throw new Exception("Out of order elements at \n" + last.toString() + "\n" + rec.toString()); + } + last = rec; + } + return true; + } + + public void indexFile() { + // Fixme -- get access to the linear index system from Jim + } + // ---------------------------------------------------------------------- // // Iteration diff --git a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java index ca1c88aaf..ae5779022 100644 --- a/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java +++ b/java/src/edu/mit/broad/sting/utils/ReferenceOrderedDatum.java @@ -1,5 +1,11 @@ package edu.mit.broad.sting.utils; +import java.util.Comparator; +import java.util.HashMap; + +// +// Ugly global variable defining the optional ordering of contig elements +// /** * Created by IntelliJ IDEA. * User: mdepristo @@ -7,15 +13,62 @@ package edu.mit.broad.sting.utils; * Time: 10:49:47 AM * To change this template use File | Settings | File Templates. */ -public abstract class ReferenceOrderedDatum { +public abstract class ReferenceOrderedDatum implements Comparable { + public static HashMap refContigOrdering = null; + public static void setContigOrdering(HashMap rco) { + refContigOrdering = rco; + } + public ReferenceOrderedDatum() { } public abstract void parseLine(final String[] parts); public abstract String toString(); public abstract String toSimpleString(); + public abstract String repl(); public abstract String getContig(); public abstract long getStart(); public abstract long getStop(); + + public int compareTo( Object x ) { + if ( this == x ) return 0; + + ReferenceOrderedDatum that = (ReferenceOrderedDatum)x; + + if ( refContigOrdering != null ) { + if ( ! refContigOrdering.containsKey(this.getContig()) ) { + if ( ! refContigOrdering.containsKey(that.getContig()) ) { + // Use regular sorted order + int cmpContig = getContig().compareTo(that.getContig()); + if ( cmpContig != 0 )return cmpContig; + } + else { + // this is always bigger if that is in the key set + return 1; + } + } + else if ( ! refContigOrdering.containsKey(that.getContig()) ) + return -1; + else { + assert refContigOrdering.containsKey(this.getContig()) : this; + assert refContigOrdering.containsKey(that.getContig()) : that; + + final int thisO = refContigOrdering.get(this.getContig()); + final int thatO = refContigOrdering.get(that.getContig()); + if ( thisO < thatO ) return -1; + if ( thisO > thatO ) return 1; + } + } + else { + int cmpContig = getContig().compareTo(that.getContig()); + if ( cmpContig != 0 )return cmpContig; + } + + if ( this.getStart() < that.getStart() ) return -1; + if ( this.getStart() > that.getStart() ) return 1; + if ( this.getStop() < that.getStop() ) return -1; + if ( this.getStop() > that.getStop() ) return 1; + return 0; + } } diff --git a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java index 695ec9c9d..eb29a36ca 100644 --- a/java/src/edu/mit/broad/sting/utils/rodDbSNP.java +++ b/java/src/edu/mit/broad/sting/utils/rodDbSNP.java @@ -72,31 +72,41 @@ public class rodDbSNP extends ReferenceOrderedDatum { return String.format("%s:%s", name, observed); } + public String repl() { + return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\tX\tX\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", + 585, contig, start-1, stop-1, name, strand, observed, molType, + varType, validationStatus, avHet, avHetSE, func, locType, weight ); + } + public void parseLine(final String[] parts) { - //System.out.printf("Parsing GFFLine %s%n", Utils.join(" <=> ", parts)); - contig = parts[1]; - start = Long.parseLong(parts[2]) + 1; // The final is 0 based - stop = Long.parseLong(parts[3]) + 1; // The final is 0 based - name = parts[4]; - strand = parts[5]; - observed = parts[9]; - molType = parts[10]; - varType = parts[11]; - validationStatus = parts[12]; - avHet = Double.parseDouble(parts[13]); - avHetSE = Double.parseDouble(parts[14]); - func = parts[15]; - locType = parts[16]; - weight = Integer.parseInt(parts[17]); + try { + contig = parts[1]; + start = Long.parseLong(parts[2]) + 1; // The final is 0 based - // Cut up the observed bases string into an array of individual bases - String[] bases = observed.split("/"); - observedBases = new char[bases.length]; - for ( String elt : bases ) { - observedBases[0] = (char)elt.getBytes()[0]; - //System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]); + stop = Long.parseLong(parts[3]) + 1; // The final is 0 based + name = parts[4]; + strand = parts[6]; + observed = parts[9]; + molType = parts[10]; + varType = parts[11]; + validationStatus = parts[12]; + avHet = Double.parseDouble(parts[13]); + avHetSE = Double.parseDouble(parts[14]); + func = parts[15]; + locType = parts[16]; + weight = Integer.parseInt(parts[17]); + + // Cut up the observed bases string into an array of individual bases + String[] bases = observed.split("/"); + observedBases = new char[bases.length]; + for ( String elt : bases ) { + observedBases[0] = (char)elt.getBytes()[0]; + //System.out.printf(" Bases %s %d %c%n", elt, elt.getBytes()[0], (char)elt.getBytes()[0]); + } + //System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases)); + } catch ( RuntimeException e ) { + System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); + throw e; } - //System.out.printf(" => Observed bases are %s%n", Utils.join(" B ", bases)); - } } \ No newline at end of file diff --git a/java/src/edu/mit/broad/sting/utils/rodGFF.java b/java/src/edu/mit/broad/sting/utils/rodGFF.java index 86fdfa4fa..ea94f7fa8 100644 --- a/java/src/edu/mit/broad/sting/utils/rodGFF.java +++ b/java/src/edu/mit/broad/sting/utils/rodGFF.java @@ -98,6 +98,10 @@ public class rodGFF extends ReferenceOrderedDatum { return String.format("%s\t%s\t%s\t%d\t%d\t%f\t%s\t%s", contig, source, feature, start, stop, score, strand, frame); } + public String repl() { + return this.toString(); + } + public String toSimpleString() { return String.format("%s", feature); }