From 8e9e2f4502066c8666c71d93f9c701e1c1866052 Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 14 May 2009 21:06:28 +0000 Subject: [PATCH] Revised ROD system. Split the system in Basic type and interface. Enabled more control over rod accessing, including an initialize() function to fetch headers and other options from the file. Added general tabular rod, which has a named columns and supports a map interface. Comes with shiny new Junit system for RODs. Also, added simple python script for accessing picard data. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@716 348d0f76-0448-11de-a6fe-93d51630548a --- .../refdata/BasicReferenceOrderedDatum.java | 45 +++++ .../refdata/HapMapAlleleFrequenciesROD.java | 7 +- .../gatk/refdata/ReferenceOrderedData.java | 109 ++++++----- .../gatk/refdata/ReferenceOrderedDatum.java | 45 +++-- .../sting/gatk/refdata/TabularROD.java | 171 ++++++++++++++++++ .../sting/gatk/refdata/rodDbSNP.java | 11 +- .../sting/gatk/refdata/rodGFF.java | 5 +- .../sting/gatk/refdata/rodSAMPileup.java | 7 +- .../sting/utils/duplicates/DupUtils.java | 2 +- .../org/broadinstitute/sting/BaseTest.java | 3 +- .../sting/gatk/refdata/TabularRODTest.java | 121 +++++++++++++ python/picard_utils.py | 51 ++++++ testdata/TabularDataTest.dat | 7 + testdata/TabularDataTest2.dat | 7 + 14 files changed, 509 insertions(+), 82 deletions(-) create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java create mode 100755 java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java create mode 100755 java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java create mode 100755 python/picard_utils.py create mode 100755 testdata/TabularDataTest.dat create mode 100755 testdata/TabularDataTest2.dat diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java new file mode 100755 index 000000000..44e3d5c2e --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/BasicReferenceOrderedDatum.java @@ -0,0 +1,45 @@ +package org.broadinstitute.sting.gatk.refdata; + +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +/** + * Created by IntelliJ IDEA. + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:49:47 AM + * To change this template use File | Settings | File Templates. + */ +public abstract class BasicReferenceOrderedDatum implements ReferenceOrderedDatum { + protected String name; + + public BasicReferenceOrderedDatum(String name) { + this.name = name; + } + + public String getName() { return this.name; } + + public abstract boolean parseLine(final Object header, final String[] parts) throws IOException; + + public abstract String toString(); + public String toSimpleString() { return toString(); } + public String repl() { return this.toString(); } + + public String delimiter() { + return "\t"; + } + + public abstract GenomeLoc getLocation(); + + public int compareTo( ReferenceOrderedDatum that ) { + return getLocation().compareTo(that.getLocation()); + } + + public Object initialize(final File source) throws FileNotFoundException { + //System.out.printf("Initialize called with %s%n", source); + return null; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java index be677e9d7..443b90b7e 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/HapMapAlleleFrequenciesROD.java @@ -11,7 +11,7 @@ import edu.mit.broad.picard.util.SequenceUtil; /** * ReferenceOrderedDatum class to hold HapMap AlleleFrequency Data */ -public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { +public class HapMapAlleleFrequenciesROD extends BasicReferenceOrderedDatum { public GenomeLoc loc; // genome location of SNP // Reference sequence chromosome or scaffold // Start and stop positions in chrom @@ -46,7 +46,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { return String.format( "%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s\t%d\t%d\t%1.3f\t%1.3f\t%d", - rsNumber, hgBuild, getContig(), getStart(), strand, alleles, refAllele, varAllele, + rsNumber, hgBuild, getLocation().getContig(), getLocation().getStart(), strand, alleles, refAllele, varAllele, refCounts, varCounts, refFreq, varFreq, totalCounts); } @@ -58,7 +58,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { return toString(); } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { try { // rs11511647 <=> HG18 <=> chr10 <=> 62765 <=> + <=> T/C <=> T <=> C <=> 21 <=> 97 <=> 0.178 <=> 0.822 <=> 118 @@ -85,6 +85,7 @@ public class HapMapAlleleFrequenciesROD extends ReferenceOrderedDatum { System.out.printf(" Exception caught during parsing HapMap Allele Freq %s%n", Utils.join(" <=> ", parts)); throw e; } + return true; } public double getVarAlleleFreq() { return this.varFreq; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java index b629e6911..3602c1834 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedData.java @@ -25,6 +25,13 @@ import org.apache.log4j.Logger; public class ReferenceOrderedData implements Iterable { private String name; private File file = null; + private String fieldDelimiter; + + /** + * Header object returned from the datum + */ + private Object header = null; + private Class type = null; // runtime type information for object construction // ---------------------------------------------------------------------- @@ -53,6 +60,7 @@ public class ReferenceOrderedData implements addModule("dbSNP", rodDbSNP.class); addModule("HapMapAlleleFrequencies", HapMapAlleleFrequenciesROD.class); addModule("SAMPileup", rodSAMPileup.class); + addModule("Table", TabularROD.class); } @@ -124,6 +132,8 @@ public class ReferenceOrderedData implements this.file = file; this.type = type; this.name = name; + this.header = initializeROD(name, file, type); + this.fieldDelimiter = newROD(name, type).delimiter(); } public String getName() { return name; } @@ -218,8 +228,9 @@ public class ReferenceOrderedData implements public ROD next() { final String line = parser.next(); //System.out.printf("Line is %s%n", line); - String parts[] = line.split("\t"); - return parseLine(parts); + String parts[] = line.split(fieldDelimiter); + ROD n = parseLine(parts); + return n != null ? n : next(); } public void remove() { @@ -263,52 +274,41 @@ public class ReferenceOrderedData implements return prev; } - // - // Seeks forward in the file until we reach (or cross) a record at contig / pos - // If we don't find anything and cross beyond contig / pos, we return null - // Otherwise we return the first object who's start is at pos - // + /** + * Seeks forward in the file until we reach (or cross) a record at contig / pos + * If we don't find anything and cross beyond contig / pos, we return null + * Otherwise we return the first object who's start is at pos + * + * @param loc + * @return + */ public ROD seekForward(final GenomeLoc loc) { - return seekForward(loc.getContig(), loc.getStart()); - } - - protected ROD seekForward(final String contigName, final long pos) { - final boolean DEBUG = false; + final boolean DEBUG = false; ROD result = null; - if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", contigName, pos, prev); + if ( DEBUG ) System.out.printf(" *** starting seek to %s %d %s%n", loc.getContig(), loc.getStart(), prev); while ( hasNext() ) { ROD current = next(); //System.out.printf(" -> Seeking to %s %d AT %s %d%n", contigName, pos, current.getContig(), current.getStart()); - int strCmp = GenomeLoc.compareContigs( contigName, prev.getContig() );// contigName.compareTo( prev.getContig() ); - if ( strCmp == 0 ) { - // The contigs are equal - if ( current.getStart() > pos ) { - // There was nothing to find, push back next and return null - it.pushback(current); - break; - } - else if ( pos == current.getStart() ) { - // We found a record at contig / pos, return it - result = current; - break; - } + int cmp = current.getLocation().compareTo(loc); + if ( cmp < 0 ) { + // current occurs before loc, continue searching + continue; } - else if ( strCmp < 0 ) { - if ( DEBUG ) System.out.printf(" -> Jumped to contig %s%n", contigName); - // We've gone past the desired contig, break + else if ( cmp == 0 ) { + result = current; + break; + } else { + // current is after loc it.pushback(current); break; } } if ( DEBUG ) { - if ( result == null ) - ; - //System.out.printf(" --- seek result to %s %d is NULL%n", contigName, pos); - else - System.out.printf(" ### Found %s %d%n", result.getContig(), result.getStart()); + if ( result != null ) + System.out.printf(" ### Found %s%n", result.getLocation()); } @@ -326,26 +326,39 @@ public class ReferenceOrderedData implements // Parsing // // ---------------------------------------------------------------------- - ROD parseLine(final String[] parts) { - //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + private ROD newROD( final String name, final Class type ) { try { - //ROD obj = type.newInstance(); Constructor c = type.getConstructor(String.class); - ROD obj = (ROD)c.newInstance(name); - obj.parseLine(parts); - return obj; + return (ROD)c.newInstance(name); } catch ( java.lang.InstantiationException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( java.lang.IllegalAccessException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( java.lang.NoSuchMethodException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } catch ( InvocationTargetException e ) { - System.out.println(e); - return null; // wow, unsafe! + throw new RuntimeException(e); } } + + private Object initializeROD(final String name, final File file, final Class type) { + ROD rod = newROD(name, type); + try { + return rod.initialize(file); + } catch ( FileNotFoundException e ) { + throw new RuntimeException(e); + } + } + + private ROD parseLine(final String[] parts) { + //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); + ROD obj = newROD(name, type); + try { + if ( ! obj.parseLine(header, parts) ) + obj = null; + } catch (IOException e) { + throw new RuntimeException("Badly formed ROD: " + e); + } + return obj; + } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java index 9c34b4ab9..8061a7a27 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java @@ -2,6 +2,10 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + /** * Created by IntelliJ IDEA. * User: mdepristo @@ -9,27 +13,28 @@ import org.broadinstitute.sting.utils.GenomeLoc; * Time: 10:49:47 AM * To change this template use File | Settings | File Templates. */ -public abstract class ReferenceOrderedDatum implements Comparable { - protected String name; +public interface ReferenceOrderedDatum extends Comparable { + public String getName(); + public boolean parseLine(final Object header, final String[] parts) throws IOException; + public String toString(); + public String toSimpleString(); + public String repl(); - public ReferenceOrderedDatum(String name) { - this.name = name; - } + /** + * Used by the ROD system to determine how to split input lines + * @return Regex string delimiter separating fields + */ + public String delimiter(); - public String getName() { return this.name; } + public GenomeLoc getLocation(); + public int compareTo( ReferenceOrderedDatum that ); - public abstract void parseLine(final String[] parts); - - public abstract String toString(); - public abstract String toSimpleString(); - public abstract String repl(); - - public abstract GenomeLoc getLocation(); - public int compareTo( ReferenceOrderedDatum that ) { - return getLocation().compareTo(that.getLocation()); - } - - public final String getContig() { return getLocation().getContig(); } - public final long getStart() { return getLocation().getStart(); } - public final long getStop() { return getLocation().getStop(); } + /** + * Backdoor hook to read header, meta-data, etc. associated with the file. Will be + * called by the ROD system before streaming starts + * + * @param source source data file on disk from which this rod stream will be pulled + * @return a header object that will be passed to parseLine command + */ + public Object initialize(final File source) throws FileNotFoundException; } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java new file mode 100755 index 000000000..3441fafe7 --- /dev/null +++ b/java/src/org/broadinstitute/sting/gatk/refdata/TabularROD.java @@ -0,0 +1,171 @@ +package org.broadinstitute.sting.gatk.refdata; + +import java.util.*; +import java.util.regex.MatchResult; +import java.util.regex.Pattern; +import java.util.regex.Matcher; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; + +import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; +import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.Utils; +import org.broadinstitute.sting.utils.xReadLines; +import org.apache.log4j.Logger; + +/** + * Class for representing arbitrary reference ordered data sets + * + * User: mdepristo + * Date: Feb 27, 2009 + * Time: 10:47:14 AM + * To change this template use File | Settings | File Templates. + */ +public class TabularROD extends BasicReferenceOrderedDatum implements Map { + private static Logger logger = Logger.getLogger(TabularROD.class); + + private GenomeLoc loc; + private HashMap attributes; + private ArrayList header; + + // ---------------------------------------------------------------------- + // + // Constructors + // + // ---------------------------------------------------------------------- + public TabularROD(final String name) { + super(name); + attributes = new HashMap(); + } + + /** + * Walks through the source files looking for the header line, which it returns as a + * list of strings. + * + * @param source + * @return + */ + public Object initialize(final File source) throws FileNotFoundException { + List header = null; + int linesLookedAt = 0; + xReadLines reader = new xReadLines(source); + + for ( String line : reader ) { + Matcher m = HEADER_PATTERN.matcher(line); + if ( m.matches() ) { + //System.out.printf("Found a header line: %s%n", line); + header = new ArrayList(Arrays.asList(line.split("\\s+"))); + } + + if ( linesLookedAt++ > MAX_LINES_TO_LOOK_FOR_HEADER ) + break; + } + + // check that we found the header + if ( header != null ) { + logger.debug(String.format("HEADER IS %s%n", Utils.join(":", header))); + } else { + throw new RuntimeException("Couldn't find header line in TabularROD!"); + } + + //System.exit(1); + return header; + } + + private static int MAX_LINES_TO_LOOK_FOR_HEADER = 1000; + private static Pattern HEADER_PATTERN = Pattern.compile("^\\s*HEADER.*"); + private static Pattern COMMENT_PATTERN = Pattern.compile("^#.*"); + + // ---------------------------------------------------------------------- + // + // Accessors + // + // ---------------------------------------------------------------------- + public GenomeLoc getLocation() { + return loc; + } + + public String get(final Object key) { + return attributes.get(key); + } + + public String put(final String key, final String object) { + return attributes.put(key, object); + } + + public boolean containsKey(final Object key) { + return attributes.containsKey(key); + } + + public HashMap getAttributes() { + return attributes; + } + + public String getAttributeString() { + List strings = new ArrayList(attributes.size()); + for ( String key : header ) { + if ( containsKey(key) ) { // avoid the header + strings.add(this.get(key)); + System.out.printf("Adding %s%n", this.get(key)); + } + } + return Utils.join("\t", strings); + } + + // ---------------------------------------------------------------------- + // + // map functions + // + // ---------------------------------------------------------------------- + public int size() { return attributes.size(); } + public boolean isEmpty() { return attributes.isEmpty(); } + public boolean containsValue(Object o) { return attributes.containsValue(o); } + public String remove(Object o) { return attributes.remove(o); } + public void clear() { attributes.clear(); } + public java.util.Set keySet() { return attributes.keySet(); } + public java.util.Collection values() { return attributes.values(); } + + public void putAll(java.util.Map map) { + attributes.putAll(map); + } + + public java.util.Set> entrySet() { + return attributes.entrySet(); + } + + // ---------------------------------------------------------------------- + // + // formatting + // + // ---------------------------------------------------------------------- + public String toString() { + return String.format("%s\t%s", getLocation(), getAttributeString()); + } + + public String delimiter() { + return "\\s+"; + } + + public boolean parseLine(final Object headerObj, final String[] parts) throws IOException { + header = (ArrayList)(headerObj); + + //System.out.printf("parts [len=%d] is '%s'%n", parts.length, Utils.join(":", parts)); + + if ( parts.length == 0 || COMMENT_PATTERN.matcher(parts[0]).matches() || HEADER_PATTERN.matcher(parts[0]).matches() ) + return false; + + if (header.size() != parts.length) { + throw new IOException(String.format("Header length %d not equal to Tabular parts length %d", header.size(), parts.length)); + } + + loc = GenomeLoc.parseGenomeLoc(parts[0]); + for ( int i = 1; i < parts.length; i++ ) { + put(header.get(i), parts[i]); + } + + //System.out.printf("Parsed %s%n", this); + + return true; + } +} \ No newline at end of file diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java index 0cb0093db..7c9753a78 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodDbSNP.java @@ -19,7 +19,7 @@ import org.broadinstitute.sting.gatk.refdata.AllelicVariant; * Time: 10:47:14 AM * To change this template use File | Settings | File Templates. */ -public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { +public class rodDbSNP extends BasicReferenceOrderedDatum implements AllelicVariant { public GenomeLoc loc; // genome location of SNP // Reference sequence chromosome or scaffold // Start and stop positions in chrom @@ -128,7 +128,8 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - getContig(), getStart(), getStop(), name, strand, refBases, observed, molType, + getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), + name, strand, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } @@ -147,11 +148,12 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { public String repl() { return String.format("%d\t%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\t%f\t%s\t%s\t%d", - 585, getContig(), getStart()-1, getStop()-1, name, strand, refBases, refBases, observed, molType, + 585, getLocation().getContig(), getLocation().getStart()-1, getLocation().getStop()-1, + name, strand, refBases, refBases, observed, molType, varType, validationStatus, avHet, avHetSE, func, locType, weight ); } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { try { String contig = parts[1]; long start = Long.parseLong(parts[2]) + 1; // The final is 0 based @@ -170,6 +172,7 @@ public class rodDbSNP extends ReferenceOrderedDatum implements AllelicVariant { func = parts[15]; locType = parts[16]; weight = Integer.parseInt(parts[17]); + return true; } catch ( RuntimeException e ) { System.out.printf(" Exception caught during parsing GFFLine %s%n", Utils.join(" <=> ", parts)); throw e; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java index 41dfd360b..752c907bb 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodGFF.java @@ -18,7 +18,7 @@ import org.broadinstitute.sting.utils.Utils; * Time: 10:47:14 AM * To change this template use File | Settings | File Templates. */ -public class rodGFF extends ReferenceOrderedDatum { +public class rodGFF extends BasicReferenceOrderedDatum { private String contig, source, feature, strand, frame; private long start, stop; private double score; @@ -134,7 +134,7 @@ public class rodGFF extends ReferenceOrderedDatum { return attributes; } - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { //System.out.printf("Parsing GFFLine %s%n", Utils.join(" ", parts)); final String contig = parts[0]; @@ -152,5 +152,6 @@ public class rodGFF extends ReferenceOrderedDatum { final String attributeParts = Utils.join(" ", parts, 8, parts.length); HashMap attributes = parseAttributes(attributeParts); setValues(contig, source, feature, start, stop, score, strand, frame, attributes); + return true; } } diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java index 99ba087e5..1249f6879 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/rodSAMPileup.java @@ -25,7 +25,7 @@ import org.broadinstitute.sting.utils.Pileup; * Time: 2:58:33 PM * To change this template use File | Settings | File Templates. */ -public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pileup { +public class rodSAMPileup extends BasicReferenceOrderedDatum implements Genotype, Pileup { private static final int NO_VARIANT = -1; private static final int SNP_VARIANT = 0; private static final int INSERTION_VARIANT = 1; @@ -69,7 +69,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil } @Override - public void parseLine(final String[] parts) { + public boolean parseLine(final Object header, final String[] parts) { // 0 1 2 3 4 5 6 7 // * chrX 466 T Y 170 170 88 32 ... (piles of read bases and quals follow) // * chrX 141444 * +CA/+CA 32 468 255 25 +CA * 5 2 12 6 @@ -134,6 +134,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil System.out.printf(" Exception caught during parsing BasicPileup line: %s%n", Utils.join(" <=> ", parts)); throw e; } + return true; // if ( nNonref > 1 ) System.out.println("SAM pileup: WARNING: multi-allelic variant : ("+refBaseChar+") -->"+toMediumString()); } @@ -319,7 +320,7 @@ public class rodSAMPileup extends ReferenceOrderedDatum implements Genotype, Pil // ---------------------------------------------------------------------- public String toString() { return String.format("%s\t%d\t%d\t%s\t%s\t%s", - getContig(), getStart(), getStop(), name, refBases, observedString); + getLocation().getContig(), getLocation().getStart(), getLocation().getStop(), name, refBases, observedString); } public String toSimpleString() { diff --git a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java index 1f1d0d267..925c6c8f6 100644 --- a/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java +++ b/java/src/org/broadinstitute/sting/utils/duplicates/DupUtils.java @@ -51,7 +51,7 @@ public class DupUtils { copy.setInferredInsertSize(read.getInferredInsertSize()); copy.setMappingQuality(read.getMappingQuality()); copy.setCigar(read.getCigar()); - copy.setFlags(copy.getFlags()); + copy.setFlags(read.getFlags()); return copy; } diff --git a/java/test/org/broadinstitute/sting/BaseTest.java b/java/test/org/broadinstitute/sting/BaseTest.java index a22fcdc53..7b9abce55 100755 --- a/java/test/org/broadinstitute/sting/BaseTest.java +++ b/java/test/org/broadinstitute/sting/BaseTest.java @@ -42,7 +42,8 @@ public abstract class BaseTest { protected static String seqLocation = "/seq"; protected static String oneKGLocation = "/broad/1KG"; - + protected static String testDir = "testdata/"; + protected static boolean alreadySetup = false; private static long startTime; diff --git a/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java new file mode 100755 index 000000000..053df9cae --- /dev/null +++ b/java/test/org/broadinstitute/sting/gatk/refdata/TabularRODTest.java @@ -0,0 +1,121 @@ +// our package +package org.broadinstitute.sting.gatk.refdata; + + +// the imports for unit testing. + +import org.junit.*; +import static org.junit.Assert.assertTrue; +import org.broadinstitute.sting.BaseTest; +import org.broadinstitute.sting.utils.fasta.FastaSequenceFile2; +import org.broadinstitute.sting.utils.RefHanger; +import org.broadinstitute.sting.utils.GenomeLoc; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +/** + * Basic unit test for TabularROD + * + */ +public class TabularRODTest extends BaseTest { + private static FastaSequenceFile2 seq; + private ReferenceOrderedData ROD; + private ReferenceOrderedData.RODIterator iter; + private ReferenceOrderedData ROD2; + private ReferenceOrderedData.RODIterator iter2; + + @BeforeClass + public static void init() { + // sequence + seq = new FastaSequenceFile2(new File(seqLocation + "/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")); + GenomeLoc.setupRefContigOrdering(seq); + } + + @Before + public void setupTabularROD() { + File file = new File(testDir + "TabularDataTest.dat"); + ROD = new ReferenceOrderedData("tableTest", file, TabularROD.class); + iter = ROD.iterator(); + + File file2 = new File(testDir + "TabularDataTest2.dat"); + ROD2 = new ReferenceOrderedData("tableTest", file2, TabularROD.class); + iter2 = ROD2.iterator(); + } + + @Test + public void test1() { + logger.warn("Executing test1"); + TabularROD one = (TabularROD)iter.next(); + assertTrue(one.size() == 3); + assertTrue(one.getLocation().equals(new GenomeLoc("chrM", 10))); + assertTrue(one.get("COL1").equals("A")); + assertTrue(one.get("COL2").equals("B")); + assertTrue(one.get("COL3").equals("C")); + } + + @Test + public void test2() { + logger.warn("Executing test2"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + assertTrue(two.size() == 3); + assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20))); + assertTrue(two.get("COL1").equals("C")); + assertTrue(two.get("COL2").equals("D")); + assertTrue(two.get("COL3").equals("E")); + } + + @Test + public void test3() { + logger.warn("Executing test3"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + TabularROD three = (TabularROD)iter.next(); + assertTrue(three.size() == 3); + assertTrue(three.getLocation().equals(new GenomeLoc("chrM", 30))); + assertTrue(three.get("COL1").equals("F")); + assertTrue(three.get("COL2").equals("G")); + assertTrue(three.get("COL3").equals("H")); + } + + @Test + public void testDone() { + logger.warn("Executing testDone"); + TabularROD one = (TabularROD)iter.next(); + TabularROD two = (TabularROD)iter.next(); + TabularROD three = (TabularROD)iter.next(); + assertTrue(!iter.hasNext()); + } + + @Test + public void testSeek() { + logger.warn("Executing testSeek"); + TabularROD two = (TabularROD)iter.seekForward(new GenomeLoc("chrM", 20)); + assertTrue(two.size() == 3); + assertTrue(two.getLocation().equals(new GenomeLoc("chrM", 20))); + assertTrue(two.get("COL1").equals("C")); + assertTrue(two.get("COL2").equals("D")); + assertTrue(two.get("COL3").equals("E")); + } + + @Test + public void testToString() { + logger.warn("Executing testToString"); + TabularROD one = (TabularROD)iter.next(); + assertTrue(one.toString().equals("chrM:10\tA\tB\tC")); + } + + @Test + public void test2p1() { + logger.warn("Executing test2p1"); + TabularROD one2 = (TabularROD)iter2.next(); + assertTrue(one2.size() == 4); + assertTrue(one2.getLocation().equals(new GenomeLoc("chrM", 10))); + assertTrue(one2.get("COL1").equals("A")); + assertTrue(one2.get("COL2").equals("B")); + assertTrue(one2.get("COL3").equals("C")); + assertTrue(one2.get("COL4").equals("1")); + } +} \ No newline at end of file diff --git a/python/picard_utils.py b/python/picard_utils.py new file mode 100755 index 000000000..fe271bbf7 --- /dev/null +++ b/python/picard_utils.py @@ -0,0 +1,51 @@ +import farm_commands +import os.path +import sys +from optparse import OptionParser +import string +import re +import glob + +#lanes = ["30JW3AAXX.6", "30KRNAAXX.1", "30KRNAAXX.6", "30PYMAAXX.5"] +#idsList = ['NA12843', 'NA19065', 'NA19064', 'NA18637'] + +lanes = ["30JW3AAXX.6", "30PYMAAXX.5", "30PNUAAXX.8", "30PPJAAXX.5"] +idsList = ['NA12843', 'NA18637', "NA19058", "NA12842"] +ids = dict(zip(lanes, idsList)) +gatkPath = "~/dev/GenomeAnalysisTK/trunk/dist/GenomeAnalysisTK.jar" +ref = "/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta" +analysis = "CombineDuplicates" + +def getPicardPath(lane, picardRoot = '/seq/picard/'): + flowcell, laneNo = lane.split('.') + filePat = os.path.join(picardRoot, flowcell, '*', laneNo, '*') + dirs = glob.glob(filePat) + print dirs + if len(dirs) > 1: + system.exit("Bad lane -- too many directories matching pattern " + filePat) + return dirs[0] + +def getReferenceGenotypeFileFromConcordanceFile(concordFile): + # REFERENCE_GENOTYPES=/seq/references/reference_genotypes/hapmap/Homo_sapiens_assembly18/NA19058.geli + p = re.compile('REFERENCE_GENOTYPES=([/.\w]+)') + for line in open(concordFile): + match = p.search(line) + print 'Match is', line, match + if match <> None: + return match.group(1) + return None + +def concord(options, geli, output, genotypeFile): + return ("java -jar /seq/software/picard/current/bin/CollectGenotypeConcordanceStatistics.jar OPTIONS_FILE=%s INPUT=%s OUTPUT=%s REFERENCE_GENOTYPES=%s MINIMUM_LOD=5.0" % ( options, geli, output, genotypeFile ) ) + +def readPicardConcordance(file): + p = re.compile('HOMOZYGOUS_REFERENCE|HETEROZYGOUS|HOMOZYGOUS_NON_REFERENCE') +# CATEGORY OBSERVATIONS AGREE DISAGREE PCT_CONCORDANCE +# HOMOZYGOUS_REFERENCE 853 853 0 1 +# HETEROZYGOUS 416 413 3 0.992788 +# HOMOZYGOUS_NON_REFERENCE 235 231 4 0.982979 + types = [str, int, int, int, float] + def parse1(line): + return [f(x) for f, x in zip(types, line.split())] + data = [parse1(line) for line in open(file) if p.match(line) <> None] + return data diff --git a/testdata/TabularDataTest.dat b/testdata/TabularDataTest.dat new file mode 100755 index 000000000..99264a087 --- /dev/null +++ b/testdata/TabularDataTest.dat @@ -0,0 +1,7 @@ +# comment +# comment line +# +HEADER COL1 COL2 COL3 +chrM:10 A B C +chrM:20 C D E +chrM:30 F G H \ No newline at end of file diff --git a/testdata/TabularDataTest2.dat b/testdata/TabularDataTest2.dat new file mode 100755 index 000000000..8d8768b53 --- /dev/null +++ b/testdata/TabularDataTest2.dat @@ -0,0 +1,7 @@ +# comment +# comment line +# +HEADER COL1 COL2 COL3 COL4 +chrM:10 A B C 1 +chrM:20 C D E 2 +chrM:30 F G H 3 \ No newline at end of file