diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java new file mode 100755 index 000000000..44e3d5c2e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:49:47 AM + * To change this template use File | Settings | File Templates. + */ +public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatum { + protected String name; + + public BasicReferenceOrderedDatum(String name) { + this.name = name; + } + + public String getName() { return this.name; } + + public abstract boolean parseLine(final Object header, final String[] parts) throws IOException; + + public abstract String toString(); + public String toSimpleString() { return toString(); } + public String repl() { return this.toString(); } + + public String delimiter() { + return "\t"; + } + + public abstract GenomeLoc getLocation(); + + public int compareTo( ReferenceOrderedDatum that ) { + return getLocation().compareTo(that.getLocation()); + } + + public Object initialize(final File source) throws FileNotFoundException { + //System.out.printf("Initialize called with %s%n", source); + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java index be677e9d7..443b90b7e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java @@ -11,7 +11,7 @@ import edu.mit.broad.picard.util.SequenceUtil; /** * ReferenceOrderedDatum class to hold HapMap AlleleFrequency Data */ -public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { +public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum { public GenomeLoc loc; // genome location of SNP // Reference sequence chromosome or scaffold // Start and stop positions in chrom @@ -46,7 +46,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { return String.format( "%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%1.3f\t%1.3f\t%d", - rsNumber, hgBuild, getContig(), getStart(), strand, alleles, refAllele, varAllele, + rsNumber, hgBuild, getLocation().getContig(), getLocation().getStart(), strand, alleles, refAllele, varAllele, refCounts, varCounts, refFreq, varFreq, totalCounts); } @@ -58,7 +58,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { return toString(); } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { try { // rs11511647 <=> HG18 <=> chr10 <=> 62765 <=> + <=> T/C <=> T <=> C <=> 21 <=> 97 <=> 0.178 <=> 0.822 <=> 118 @@ -85,6 +85,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts)); throw e; } + return true; } public double getVarAlleleFreq() { return this.varFreq; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index b629e6911..3602c1834 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -25,6 +25,13 @@ import org.apache.log4j.Logger; public class ReferenceOrderedData implements Iterable { private String name; private File file = null; + private String fieldDelimiter; + + /** + * Header object returned from the datum + */ + private Object header = null; + private Class type = null; // runtime type information for object construction // ---------------------------------------------------------------------- @@ -53,6 +60,7 @@ public class ReferenceOrderedData implements addModule("dbSNP", rodDbSNP.class); addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class); addModule("SAMPileup", rodSAMPileup.class); + addModule("Table", TabularROD.class); } @@ -124,6 +132,8 @@ public class ReferenceOrderedData implements this.file = file; this.type = type; this.name = name; + this.header = initializeROD(name, file, type); + this.fieldDelimiter = newROD(name, type).delimiter(); } public String getName() { return name; } @@ -218,8 +228,9 @@ public class ReferenceOrderedData implements public ROD next() { final String line = parser.next(); //System.out.printf("Line is %s%n", line); - String parts[] = line.split("\t"); - return parseLine(parts); + String parts[] = line.split(fieldDelimiter); + ROD n = parseLine(parts); + return n != null ? n : next(); } public void remove() { @@ -263,52 +274,41 @@ public class ReferenceOrderedData implements return prev; } - // - // Seeks forward in the file until we reach (or cross) a record at contig / pos - // If we don't find anything and cross beyond contig / pos, we return null - // Otherwise we return the first object who's start is at pos - // + /** + * Seeks forward in the file until we reach (or cross) a record at contig / pos + * If we don't find anything and cross beyond contig / pos, we return null + * Otherwise we return the first object who's start is at pos + * + * @param loc + * @return + */ public ROD seekForward(final GenomeLoc loc) { - return seekForward(loc.getContig(), loc.getStart()); - } - - protected ROD seekForward(final String contigName, final long pos) { - final boolean DEBUG = false; + final boolean DEBUG = false; ROD result = null; - if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev); + if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", loc.getContig(), loc.getStart(), prev); while ( hasNext() ) { ROD current = next(); //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); - int strCmp = GenomeLoc.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() ); - if ( strCmp == 0 ) { - // The contigs are equal - if ( current.getStart() > pos ) { - // There was nothing to find, push back next and return null - it.pushback(current); - break; - } - else if ( pos == current.getStart() ) { - // We found a record at contig / pos, return it - result = current; - break; - } + int cmp = current.getLocation().compareTo(loc); + if ( cmp < 0 ) { + // current occurs before loc, continue searching + continue; } - else if ( strCmp < 0 ) { - if ( DEBUG ) System.out.printf(" -> Jumped to contig %s%n", contigName); - // We've gone past the desired contig, break + else if ( cmp == 0 ) { + result = current; + break; + } else { + // current is after loc it.pushback(current); break; } } if ( DEBUG ) { - if ( result == null ) - ; - //System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos); - else - System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart()); + if ( result != null ) + System.out.printf(" ### Found %s%n", result.getLocation()); } @@ -326,26 +326,39 @@ public class ReferenceOrderedData implements // Parsing // // ---------------------------------------------------------------------- - ROD parseLine(final String[] parts) { - //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + private ROD newROD( final String name, final Class type ) { try { - //ROD obj = type.newInstance(); Constructor c = type.getConstructor(String.class); - ROD obj = (ROD)c.newInstance(name); - obj.parseLine(parts); - return obj; + return (ROD)c.newInstance(name); } catch ( java.lang.InstantiationException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( java.lang.IllegalAccessException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( java.lang.NoSuchMethodException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( InvocationTargetException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } } + + private Object initializeROD(final String name, final File file, final Class type) { + ROD rod = newROD(name, type); + try { + return rod.initialize(file); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } + } + + private ROD parseLine(final String[] parts) { + //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + ROD obj = newROD(name, type); + try { + if ( ! obj.parseLine(header, parts) ) + obj = null; + } catch (IOException e) { + throw new RuntimeException("Badly formed ROD: " + e); + } + return obj; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java index 9c34b4ab9..8061a7a27 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java @@ -2,6 +2,10 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + /** * Created by IntelliJ IDEA. * User: mdepristo @@ -9,27 +13,28 @@ import org.broadinstitute.sting.utils.GenomeLoc; * Time: 10:49:47 AM * To change this template use File | Settings | File Templates. */ -public abstract class ReferenceOrderedDatum implements Comparable { - protected String name; +public interface ReferenceOrderedDatum extends Comparable { + public String getName(); + public boolean parseLine(final Object header, final String[] parts) throws IOException; + public String toString(); + public String toSimpleString(); + public String repl(); - public ReferenceOrderedDatum(String name) { - this.name = name; - } + /** + * Used by the ROD system to determine how to split input lines + * @return Regex string delimiter separating fields + */ + public String delimiter(); - public String getName() { return this.name; } + public GenomeLoc getLocation(); + public int compareTo( ReferenceOrderedDatum that ); - public abstract void parseLine(final String[] parts); - - public abstract String toString(); - public abstract String toSimpleString(); - public abstract String repl(); - - public abstract GenomeLoc getLocation(); - public int compareTo( ReferenceOrderedDatum that ) { - return getLocation().compareTo(that.getLocation()); - } - - public final String getContig() { return getLocation().getContig(); } - public final long getStart() { return getLocation().getStart(); } - public final long getStop() { return getLocation().getStop(); } + /** + * Backdoor hook to read header, meta-data, etc. associated with the file. Will be + * called by the ROD system before streaming starts + * + * @param source source data file on disk from which this rod stream will be pulled + * @return a header object that will be passed to parseLine command + */ + public Object initialize(final File source) throws FileNotFoundException; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java new file mode 100755 index 000000000..3441fafe7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -0,0 +1,171 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.*; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.apache.log4j.Logger; + +/** + * Class for representing arbitrary reference ordered data sets + * + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:47:14 AM + * To change this template use File | Settings | File Templates. + */ +public class TabularROD extends BasicReferenceOrderedDatum implements Map { + private static Logger logger = Logger.getLogger(TabularROD.class); + + private GenomeLoc loc; + private HashMap attributes; + private ArrayList header; + + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + public TabularROD(final String name) { + super(name); + attributes = new HashMap(); + } + + /** + * Walks through the source files looking for the header line, which it returns as a + * list of strings. + * + * @param source + * @return + */ + public Object initialize(final File source) throws FileNotFoundException { + List header = null; + int linesLookedAt = 0; + xReadLines reader = new xReadLines(source); + + for ( String line : reader ) { + Matcher m = HEADER_PATTERN.matcher(line); + if ( m.matches() ) { + //System.out.printf("Found a header line: %s%n", line); + header = new ArrayList(Arrays.asList(line.split("\\s+"))); + } + + if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER ) + break; + } + + // check that we found the header + if ( header != null ) { + logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header))); + } else { + throw new RuntimeException("Couldn't find header line in TabularROD!"); + } + + //System.exit(1); + return header; + } + + private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000; + private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*"); + private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + + // ---------------------------------------------------------------------- + // + // Accessors + // + // ---------------------------------------------------------------------- + public GenomeLoc getLocation() { + return loc; + } + + public String get(final Object key) { + return attributes.get(key); + } + + public String put(final String key, final String object) { + return attributes.put(key, object); + } + + public boolean containsKey(final Object key) { + return attributes.containsKey(key); + } + + public HashMap getAttributes() { + return attributes; + } + + public String getAttributeString() { + List strings = new ArrayList(attributes.size()); + for ( String key : header ) { + if ( containsKey(key) ) { // avoid the header + strings.add(this.get(key)); + System.out.printf("Adding %s%n", this.get(key)); + } + } + return Utils.join("\t", strings); + } + + // ---------------------------------------------------------------------- + // + // map functions + // + // ---------------------------------------------------------------------- + public int size() { return attributes.size(); } + public boolean isEmpty() { return attributes.isEmpty(); } + public boolean containsValue(Object o) { return attributes.containsValue(o); } + public String remove(Object o) { return attributes.remove(o); } + public void clear() { attributes.clear(); } + public java.util.Set keySet() { return attributes.keySet(); } + public java.util.Collection values() { return attributes.values(); } + + public void putAll(java.util.Map map) { + attributes.putAll(map); + } + + public java.util.Set> entrySet() { + return attributes.entrySet(); + } + + // ---------------------------------------------------------------------- + // + // formatting + // + // ---------------------------------------------------------------------- + public String toString() { + return String.format("%s\t%s", getLocation(), getAttributeString()); + } + + public String delimiter() { + return "\\s+"; + } + + public boolean parseLine(final Object headerObj, final String[] parts) throws IOException { + header = (ArrayList)(headerObj); + + //System.out.printf("parts [len=%d] is '%s'%n", parts.length, Utils.join(":", parts)); + + if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() ) + return false; + + if (header.size() != parts.length) { + throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length)); + } + + loc = GenomeLoc.parseGenomeLoc(parts[0]); + for ( int i = 1; i < parts.length; i++ ) { + put(header.get(i), parts[i]); + } + + //System.out.printf("Parsed %s%n", this); + + return true; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 0cb0093db..7c9753a78 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -19,7 +19,7 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant; * Time: 10:47:14 AM * To change this template use File | Settings | File Templates. */ -public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { +public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVariant { public GenomeLoc loc; // genome location of SNP // Reference sequence chromosome or scaffold // Start and stop positions in chrom @@ -128,7 +128,8 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - getContig(), getStart(), getStop(), name, strand, refBases, observed, molType, + getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), + name, strand, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } @@ -147,11 +148,12 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { public String repl() { return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - 585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType, + 585, getLocation().getContig(), getLocation().getStart()-1, getLocation().getStop()-1, + name, strand, refBases, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { try { String contig = parts[1]; long start = Long.parseLong(parts[2]) + 1; // The final is 0 based @@ -170,6 +172,7 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { func = parts[15]; locType = parts[16]; weight = Integer.parseInt(parts[17]); + return true; } catch ( RuntimeException e ) { System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); throw e; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java index 41dfd360b..752c907bb 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java @@ -18,7 +18,7 @@ import org.broadinstitute.sting.utils.Utils; * Time: 10:47:14 AM * To change this template use File | Settings | File Templates. */ -public class rodGFF extends ReferenceOrderedDatum { +public class rodGFF extends BasicReferenceOrderedDatum { private String contig, source, feature, strand, frame; private long start, stop; private double score; @@ -134,7 +134,7 @@ public class rodGFF extends ReferenceOrderedDatum { return attributes; } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); final String contig = parts[0]; @@ -152,5 +152,6 @@ public class rodGFF extends ReferenceOrderedDatum { final String attributeParts = Utils.join(" ", parts, 8, parts.length); HashMap attributes = parseAttributes(attributeParts); setValues(contig, source, feature, start, stop, score, strand, frame, attributes); + return true; } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java index 99ba087e5..1249f6879 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java @@ -25,7 +25,7 @@ import org.broadinstitute.sting.utils.Pileup; * Time: 2:58:33 PM * To change this template use File | Settings | File Templates. */ -public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pileup { +public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup { private static final int NO_VARIANT = -1; private static final int SNP_VARIANT = 0; private static final int INSERTION_VARIANT = 1; @@ -69,7 +69,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil } @Override - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { // 0 1 2 3 4 5 6 7 // * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) // * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 @@ -134,6 +134,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts)); throw e; } + return true; // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); } @@ -319,7 +320,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s", - getContig(), getStart(), getStop(), name, refBases, observedString); + getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString); } public String toSimpleString() { diff --git a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java index 1f1d0d267..925c6c8f6 100644 --- a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java +++ b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java @@ -51,7 +51,7 @@ public class DupUtils { copy.setInferredInsertSize(read.getInferredInsertSize()); copy.setMappingQuality(read.getMappingQuality()); copy.setCigar(read.getCigar()); - copy.setFlags(copy.getFlags()); + copy.setFlags(read.getFlags()); return copy; } diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index a22fcdc53..7b9abce55 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -42,7 +42,8 @@ public abstract class BaseTest { protected static String seqLocation = "/seq"; protected static String oneKGLocation = "/broad/1KG"; - + protected static String testDir = "testdata/"; + protected static boolean alreadySetup = false; private static long startTime; diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java new file mode 100755 index 000000000..053df9cae --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -0,0 +1,121 @@ +// our package +package org.broadinstitute.sting.gatk.refdata; + + +// the imports for unit testing. + +import org.junit.*; +import static org.junit.Assert.assertTrue; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.RefHanger; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +/** + * Basic unit test for TabularROD + * + */ +public class TabularRODTest extends BaseTest { + private static FastaSequenceFile2 seq; + private ReferenceOrderedData ROD; + private ReferenceOrderedData.RODIterator iter; + private ReferenceOrderedData ROD2; + private ReferenceOrderedData.RODIterator iter2; + + @BeforeClass + public static void init() { + // sequence + seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + GenomeLoc.setupRefContigOrdering(seq); + } + + @Before + public void setupTabularROD() { + File file = new File(testDir + "TabularDataTest.dat"); + ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class); + iter = ROD.iterator(); + + File file2 = new File(testDir + "TabularDataTest2.dat"); + ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class); + iter2 = ROD2.iterator(); + } + + @Test + public void test1() { + logger.warn("Executing test1"); + TabularROD one = (TabularROD)iter.next(); + assertTrue(one.size() == 3); + assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 10))); + assertTrue(one.get("COL1").equals("A")); + assertTrue(one.get("COL2").equals("B")); + assertTrue(one.get("COL3").equals("C")); + } + + @Test + public void test2() { + logger.warn("Executing test2"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + assertTrue(two.size() == 3); + assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20))); + assertTrue(two.get("COL1").equals("C")); + assertTrue(two.get("COL2").equals("D")); + assertTrue(two.get("COL3").equals("E")); + } + + @Test + public void test3() { + logger.warn("Executing test3"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + TabularROD three = (TabularROD)iter.next(); + assertTrue(three.size() == 3); + assertTrue(three.getLocation().equals(new GenomeLoc("chrM", 30))); + assertTrue(three.get("COL1").equals("F")); + assertTrue(three.get("COL2").equals("G")); + assertTrue(three.get("COL3").equals("H")); + } + + @Test + public void testDone() { + logger.warn("Executing testDone"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + TabularROD three = (TabularROD)iter.next(); + assertTrue(!iter.hasNext()); + } + + @Test + public void testSeek() { + logger.warn("Executing testSeek"); + TabularROD two = (TabularROD)iter.seekForward(new GenomeLoc("chrM", 20)); + assertTrue(two.size() == 3); + assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20))); + assertTrue(two.get("COL1").equals("C")); + assertTrue(two.get("COL2").equals("D")); + assertTrue(two.get("COL3").equals("E")); + } + + @Test + public void testToString() { + logger.warn("Executing testToString"); + TabularROD one = (TabularROD)iter.next(); + assertTrue(one.toString().equals("chrM:10\tA\tB\tC")); + } + + @Test + public void test2p1() { + logger.warn("Executing test2p1"); + TabularROD one2 = (TabularROD)iter2.next(); + assertTrue(one2.size() == 4); + assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10))); + assertTrue(one2.get("COL1").equals("A")); + assertTrue(one2.get("COL2").equals("B")); + assertTrue(one2.get("COL3").equals("C")); + assertTrue(one2.get("COL4").equals("1")); + } +} \ No newline at end of file diff --git a/python/picard_utils.py b/python/picard_utils.py new file mode 100755 index 000000000..fe271bbf7 --- /dev/null +++ b/python/picard_utils.py @@ -0,0 +1,51 @@ +import farm_commands +import os.path +import sys +from optparse import OptionParser +import string +import re +import glob + +#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"] +#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637'] + +lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"] +idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"] +ids = dict(zip(lanes, idsList)) +gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" +ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +analysis = "CombineDuplicates" + +def getPicardPath(lane, picardRoot = '/seq/picard/'): + flowcell, laneNo = lane.split('.') + filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*') + dirs = glob.glob(filePat) + print dirs + if len(dirs) > 1: + system.exit("Bad lane -- too many directories matching pattern " + filePat) + return dirs[0] + +def getReferenceGenotypeFileFromConcordanceFile(concordFile): + # REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli + p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)') + for line in open(concordFile): + match = p.search(line) + print 'Match is', line, match + if match <> None: + return match.group(1) + return None + +def concord(options, geli, output, genotypeFile): + return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) ) + +def readPicardConcordance(file): + p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE') +# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE +# HOMOZYGOUS_REFERENCE 853 853 0 1 +# HETEROZYGOUS 416 413 3 0.992788 +# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979 + types = [str, int, int, int, float] + def parse1(line): + return [f(x) for f, x in zip(types, line.split())] + data = [parse1(line) for line in open(file) if p.match(line) <> None] + return data diff --git a/testdata/TabularDataTest.dat b/testdata/TabularDataTest.dat new file mode 100755 index 000000000..99264a087 --- /dev/null +++ b/testdata/TabularDataTest.dat @@ -0,0 +1,7 @@ +# comment +# comment line +# +HEADER COL1 COL2 COL3 +chrM:10 A B C +chrM:20 C D E +chrM:30 F G H \ No newline at end of file diff --git a/testdata/TabularDataTest2.dat b/testdata/TabularDataTest2.dat new file mode 100755 index 000000000..8d8768b53 --- /dev/null +++ b/testdata/TabularDataTest2.dat @@ -0,0 +1,7 @@ +# comment +# comment line +# +HEADER COL1 COL2 COL3 COL4 +chrM:10 A B C 1 +chrM:20 C D E 2 +chrM:30 F G H 3 \ No newline at end of file