Merge branch 'master' into SamRecordFactory

2011-10-21 09:43:52 -04:00 · 2011-10-21 09:43:52 -04:00 · 110e13bc1e
parent be797a8a1f 9f867d77ca
commit 110e13bc1e
18 changed files with 381 additions and 69 deletions
--- a/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
+++ b/public/java/src/org/broadinstitute/sting/commandline/ArgumentTypeDescriptor.java
@ -379,8 +379,10 @@ class RodBindingArgumentTypeDescriptor extends ArgumentTypeDescriptor {
                    }

                    if ( tribbleType == null )
-                        if ( ! file.canRead() | ! file.isFile() ) {
-                            throw new UserException.BadArgumentValue(name, "Couldn't read file to determine type: " + file);
+                        if ( ! file.exists() ) {
+                            throw new UserException.CouldNotReadInputFile(file, "file does not exist");
+                        } else if ( ! file.canRead() || ! file.isFile() ) {
+                            throw new UserException.CouldNotReadInputFile(file, "file could not be read");
                        } else {
                            throw new UserException.CommandLineException(
                                    String.format("No tribble type was provided on the command line and the type of the file could not be determined dynamically. " +
--- a/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/filters/ReadNameFilter.java
@ -0,0 +1,23 @@
+package org.broadinstitute.sting.gatk.filters;
+
+import net.sf.samtools.Cigar;
+import net.sf.samtools.CigarElement;
+import net.sf.samtools.CigarOperator;
+import net.sf.samtools.SAMRecord;
+import org.broadinstitute.sting.commandline.Argument;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: chartl
+ * Date: 9/19/11
+ * Time: 4:09 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class ReadNameFilter extends ReadFilter {
+     @Argument(fullName = "readName", shortName = "rn", doc="Filter out all reads except those with this read name", required=true)
+    private String readName;
+
+    public boolean filterOut(final SAMRecord rec) {
+        return ! rec.getReadName().equals(readName);
+    }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/annotator/MVLikelihoodRatio.java
@ -0,0 +1,58 @@
+package org.broadinstitute.sting.gatk.walkers.annotator;
+
+import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
+import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
+import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.AnnotatorCompatibleWalker;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.ExperimentalAnnotation;
+import org.broadinstitute.sting.gatk.walkers.annotator.interfaces.InfoFieldAnnotation;
+import org.broadinstitute.sting.utils.BaseUtils;
+import org.broadinstitute.sting.utils.MendelianViolation;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFFilterHeaderLine;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFHeaderLineType;
+import org.broadinstitute.sting.utils.codecs.vcf.VCFInfoHeaderLine;
+import org.broadinstitute.sting.utils.exceptions.UserException;
+import org.broadinstitute.sting.utils.pileup.PileupElement;
+import org.broadinstitute.sting.utils.variantcontext.VariantContext;
+
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Created by IntelliJ IDEA.
+ * User: chartl
+ * Date: 9/14/11
+ * Time: 12:24 PM
+ * To change this template use File | Settings | File Templates.
+ */
+public class MVLikelihoodRatio extends InfoFieldAnnotation implements ExperimentalAnnotation {
+
+    private MendelianViolation mendelianViolation = null;
+
+    public Map<String, Object> annotate(RefMetaDataTracker tracker, AnnotatorCompatibleWalker walker, ReferenceContext ref, Map<String, AlignmentContext> stratifiedContexts, VariantContext vc) {
+        if ( mendelianViolation == null ) {
+            if ( walker instanceof VariantAnnotator ) {
+                mendelianViolation = new MendelianViolation(((VariantAnnotator)walker).familyStr, ((VariantAnnotator)walker).minGenotypeQualityP );
+            }
+            else {
+                throw new UserException("Mendelian violation annotation can only be used from the Variant Annotator");
+            }
+        }
+
+        Map<String,Object> toRet = new HashMap<String,Object>(1);
+        boolean hasAppropriateGenotypes = vc.hasGenotype(mendelianViolation.getSampleChild()) &&
+                vc.hasGenotype(mendelianViolation.getSampleDad()) &&
+                vc.hasGenotype(mendelianViolation.getSampleMom());
+        if ( hasAppropriateGenotypes )
+            toRet.put("MVLR",mendelianViolation.violationLikelihoodRatio(vc));
+
+        return toRet;
+    }
+
+    // return the descriptions used for the VCF INFO meta field
+    public List<String> getKeyNames() { return Arrays.asList("MVLR"); }
+
+    public List<VCFInfoHeaderLine> getDescriptions() { return Arrays.asList(new VCFInfoHeaderLine("MVLR", 1, VCFHeaderLineType.Float, "Mendelian violation likelihood ratio: L[MV] - L[No MV]")); }
+}
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/coverage/GCContentByIntervalWalker.java
@ -42,12 +42,12 @@ import java.util.List;
 *
 * <h2>Input</h2>
 * <p>
- * One or more BAM files.
+ *  A reference file
 * </p>
 *
 * <h2>Output</h2>
 * <p>
- * GC content calculations per interval.
+ *  GC content calculations per interval.
 * </p>
 *
 * <h2>Examples</h2>
@ -56,7 +56,6 @@ import java.util.List;
 *   -R ref.fasta \
 *   -T GCContentByInterval \
 *   -o output.txt \
- *   -I input.bam \
 *   -L input.intervals
 * </pre>
 *
--- a/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
+++ b/public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.walkers.variantutils;
 import org.broadinstitute.sting.commandline.*;
 import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
 import org.broadinstitute.sting.utils.MathUtils;
+import org.broadinstitute.sting.utils.variantcontext.Allele;
 import org.broadinstitute.sting.utils.variantcontext.VariantContext;
 import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
 import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
@ -133,7 +134,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {

    /**
     * By default, this tool throws a UserException when it encounters a field without a value in some record.  This
-     * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHRMO not being
+     * is generally useful when you mistype -F CHROM, so that you get a friendly warning about CHROM not being
     * found before the tool runs through 40M 1000G records.  However, in some cases you genuinely want to allow such
     * fields (e.g., AC not being calculated for filtered records, if included).  When provided, this argument
     * will cause VariantsToTable to write out NA values for missing fields instead of throwing an error.
@ -294,6 +295,14 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
                return x.toString();
            }
        });
+        getters.put("EVENTLENGTH", new Getter() { public String get(VariantContext vc) {
+            int maxLength = 0;
+            for ( final Allele a : vc.getAlternateAlleles() ) {
+                final int length = a.length() - vc.getReference().length();
+                if( Math.abs(length) > Math.abs(maxLength) ) { maxLength = length; }
+            }
+            return Integer.toString(maxLength);
+        }});
        getters.put("QUAL", new Getter() { public String get(VariantContext vc) { return Double.toString(vc.getPhredScaledQual()); } });
        getters.put("TRANSITION", new Getter() { public String get(VariantContext vc) {
            if ( vc.isSNP() && vc.isBiallelic() )
@ -304,7 +313,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
        getters.put("FILTER", new Getter() { public String get(VariantContext vc) {
            return vc.isNotFiltered() ? "PASS" : Utils.join(",", vc.getFilters()); }
        });
-
+        getters.put("ID", new Getter() { public String get(VariantContext vc) { return vc.hasID() ? vc.getID() : "."; } });
        getters.put("HET", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHetCount()); } });
        getters.put("HOM-REF", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomRefCount()); } });
        getters.put("HOM-VAR", new Getter() { public String get(VariantContext vc) { return Integer.toString(vc.getHomVarCount()); } });
--- a/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
+++ b/public/java/src/org/broadinstitute/sting/utils/MendelianViolation.java
@ -26,6 +26,9 @@ public class MendelianViolation {

    double minGenotypeQuality;

+    static final int[] mvOffsets = new int[] { 1,2,5,6,8,11,15,18,20,21,24,25 };
+    static final int[] nonMVOffsets = new int[]{ 0,3,4,7,9,10,12,13,14,16,17,19,22,23,26 };
+
    private static Pattern FAMILY_PATTERN = Pattern.compile("(.*)\\+(.*)=(.*)");

    public String getSampleMom() {
@ -134,4 +137,43 @@ public class MendelianViolation {
            return false;
        return true;
    }
+
+    /**
+     * @return the likelihood ratio for a mendelian violation
+     */
+    public double violationLikelihoodRatio(VariantContext vc) {
+        double[] logLikAssignments = new double[27];
+        // the matrix to set up is
+        // MOM   DAD    CHILD
+        //                    |-  AA
+        //   AA     AA    |    AB
+        //                    |-   BB
+        //                    |- AA
+        //  AA     AB     |   AB
+        //                    |- BB
+        // etc. The leaves are counted as 0-11 for MVs and 0-14 for non-MVs
+        double[] momGL = vc.getGenotype(sampleMom).getLikelihoods().getAsVector();
+        double[] dadGL = vc.getGenotype(sampleDad).getLikelihoods().getAsVector();
+        double[] childGL = vc.getGenotype(sampleChild).getLikelihoods().getAsVector();
+        int offset = 0;
+        for ( int oMom = 0; oMom < 3; oMom++ ) {
+            for ( int oDad = 0; oDad < 3; oDad++ ) {
+                for ( int oChild = 0; oChild < 3; oChild ++ ) {
+                    logLikAssignments[offset++] = momGL[oMom] + dadGL[oDad] + childGL[oChild];
+                }
+            }
+        }
+        double[] mvLiks = new double[12];
+        double[] nonMVLiks = new double[15];
+        for ( int i = 0; i < 12; i ++ ) {
+            mvLiks[i] = logLikAssignments[mvOffsets[i]];
+        }
+
+        for ( int i = 0; i < 15; i++) {
+            nonMVLiks[i] = logLikAssignments[nonMVOffsets[i]];
+        }
+
+        return MathUtils.log10sumLog10(mvLiks) - MathUtils.log10sumLog10(nonMVLiks);
+    }
+
 }
--- a/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java
+++ b/public/java/src/org/broadinstitute/sting/utils/SimpleTimer.java
@ -46,7 +46,7 @@ public class SimpleTimer {
     * @return the name associated with this timer
     */
    @Ensures("result != null")
-    public String getName() {
+    public synchronized String getName() {
        return name;
    }

@ -82,14 +82,14 @@ public class SimpleTimer {
    /**
     * @return is this timer running?
     */
-    public boolean isRunning() {
+    public synchronized boolean isRunning() {
        return running;
    }

    /**
     * @return A convenience function to obtain the current time in milliseconds from this timer
     */
-    public long currentTime() {
+    public synchronized long currentTime() {
        return System.currentTimeMillis();
    }

@ -119,8 +119,4 @@ public class SimpleTimer {
    public synchronized double getElapsedTime() {
        return (running ? (currentTime() - startTime + elapsed) : elapsed) / 1000.0;
    }
-
-    public void printElapsedTime(PrintStream out) {
-        out.printf("SimpleTimer %s: %.2f%n", name, getElapsedTime());
-    }
 }
--- a/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
+++ b/public/java/src/org/broadinstitute/sting/utils/codecs/vcf/AbstractVCFCodec.java
@ -381,7 +381,8 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
            }
        }

-        attributes.put(VariantContext.ID_KEY, id);
+        if ( ! id.equals(VCFConstants.EMPTY_ID_FIELD) )
+            attributes.put(VariantContext.ID_KEY, id);
        return attributes;
    }

--- a/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
+++ b/public/java/src/org/broadinstitute/sting/utils/exceptions/UserException.java
@ -111,6 +111,10 @@ public class UserException extends ReviewedStingException {
            super(String.format("Couldn't read file because %s caused by %s", message, e.getMessage()));
        }

+        public CouldNotReadInputFile(File file) {
+            super(String.format("Couldn't read file %s", file.getAbsolutePath()));
+        }
+
        public CouldNotReadInputFile(File file, String message) {
            super(String.format("Couldn't read file %s because %s", file.getAbsolutePath(), message));
        }
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/Genotype.java
@ -108,14 +108,19 @@ public class Genotype {
    /**
     * @return the ploidy of this genotype
     */
-    public int getPloidy() { return alleles.size(); }
+    public int getPloidy() {
+        if ( alleles == null )
+            throw new ReviewedStingException("Requesting ploidy for an UNAVAILABLE genotype");
+        return alleles.size();
+    }

    public enum Type {
        NO_CALL,
        HOM_REF,
        HET,
        HOM_VAR,
-        UNAVAILABLE
+        UNAVAILABLE,
+        MIXED  // no-call and call in the same genotype
    }

    public Type getType() {
@ -129,36 +134,68 @@ public class Genotype {
        if ( alleles == null )
            return Type.UNAVAILABLE;

-        Allele firstAllele = alleles.get(0);
+        boolean sawNoCall = false, sawMultipleAlleles = false;
+        Allele observedAllele = null;

-        if ( firstAllele.isNoCall() ) {
-            return Type.NO_CALL;
+        for ( Allele allele : alleles ) {
+            if ( allele.isNoCall() )
+                sawNoCall = true;
+            else if ( observedAllele == null )
+                observedAllele = allele;
+            else if ( !allele.equals(observedAllele) )
+                sawMultipleAlleles = true;
        }

-        for (Allele a : alleles) {
-            if ( ! firstAllele.equals(a) )
-                return Type.HET;
+        if ( sawNoCall ) {
+            if ( observedAllele == null )
+                return Type.NO_CALL;
+            return Type.MIXED;
        }
-        return firstAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR;
+
+        if ( observedAllele == null )
+            throw new ReviewedStingException("BUG: there are no alleles present in this genotype but the alleles list is not null");
+
+        return sawMultipleAlleles ? Type.HET : observedAllele.isReference() ? Type.HOM_REF : Type.HOM_VAR;
    }

    /**
-     * @return true if all observed alleles are the same (regardless of whether they are ref or alt)
+     * @return true if all observed alleles are the same (regardless of whether they are ref or alt); if any alleles are no-calls, this method will return false.
     */
    public boolean isHom()    { return isHomRef() || isHomVar(); }
+
+    /**
+     * @return true if all observed alleles are ref; if any alleles are no-calls, this method will return false.
+     */
    public boolean isHomRef() { return getType() == Type.HOM_REF; }
+
+    /**
+     * @return true if all observed alleles are alt; if any alleles are no-calls, this method will return false.
+     */
    public boolean isHomVar() { return getType() == Type.HOM_VAR; }
    
    /**
-     * @return true if we're het (observed alleles differ)
+     * @return true if we're het (observed alleles differ); if the ploidy is less than 2 or if any alleles are no-calls, this method will return false.
     */
    public boolean isHet() { return getType() == Type.HET; }

    /**
-     * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF)
+     * @return true if this genotype is not actually a genotype but a "no call" (e.g. './.' in VCF); if any alleles are not no-calls (even if some are), this method will return false.
     */
    public boolean isNoCall() { return getType() == Type.NO_CALL; }
+
+    /**
+     * @return true if this genotype is comprised of any alleles that are not no-calls (even if some are).
+     */
    public boolean isCalled() { return getType() != Type.NO_CALL && getType() != Type.UNAVAILABLE; }
+
+    /**
+     * @return true if this genotype is comprised of both calls and no-calls.
+     */
+    public boolean isMixed() { return getType() == Type.MIXED; }
+
+    /**
+     * @return true if the type of this genotype is set.
+     */
    public boolean isAvailable() { return getType() != Type.UNAVAILABLE; }

    //
@ -197,14 +234,16 @@ public class Genotype {
        if ( alleles == null ) return;
        if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0");

-        int nNoCalls = 0;
+        // int nNoCalls = 0;
        for ( Allele allele : alleles ) {
            if ( allele == null )
                throw new IllegalArgumentException("BUG: allele cannot be null in Genotype");
-            nNoCalls += allele.isNoCall() ? 1 : 0;
+            // nNoCalls += allele.isNoCall() ? 1 : 0;
        }
-        if ( nNoCalls > 0 && nNoCalls != alleles.size() )
-            throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
+
+        // Technically, the spec does allow for the below case so this is not an illegal state
+        //if ( nNoCalls > 0 && nNoCalls != alleles.size() )
+        //    throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
    }

    public String getGenotypeString() {
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/MutableGenotype.java
@ -40,19 +40,7 @@ public class MutableGenotype extends Genotype {
     */
    public void setAlleles(List<Allele> alleles) {
        this.alleles = new ArrayList<Allele>(alleles);
-
-        // todo -- add validation checking here
-
-        if ( alleles == null ) throw new IllegalArgumentException("BUG: alleles cannot be null in setAlleles");
-        if ( alleles.size() == 0) throw new IllegalArgumentException("BUG: alleles cannot be of size 0 in setAlleles");
-
-        int nNoCalls = 0;
-        for ( Allele allele : alleles ) { nNoCalls += allele.isNoCall() ? 1 : 0; }
-        if ( nNoCalls > 0 && nNoCalls != alleles.size() )
-            throw new IllegalArgumentException("BUG: alleles include some No Calls and some Calls, an illegal state " + this);
-
-        for ( Allele allele : alleles )
-            if ( allele == null ) throw new IllegalArgumentException("BUG: Cannot add a null allele to a genotype");
+        validate();
    }

    public void setPhase(boolean isPhased) {
--- a/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
+++ b/public/java/src/org/broadinstitute/sting/utils/variantcontext/VariantContext.java
@ -998,7 +998,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
                else if ( g.isHomVar() )
                    genotypeCounts[Genotype.Type.HOM_VAR.ordinal()]++;
                else
-                    throw new IllegalStateException("Genotype of unknown type: " + g);
+                    genotypeCounts[Genotype.Type.MIXED.ordinal()]++;
            }
        }
    }
@ -1042,6 +1042,15 @@ public class VariantContext implements Feature { // to enable tribble intergrati
        return genotypeCounts[Genotype.Type.HOM_VAR.ordinal()];
    }

+    /**
+     * Genotype-specific functions -- how many mixed calls are there in the genotypes?
+     *
+     * @return number of mixed calls
+     */
+    public int getMixedCount() {
+        return genotypeCounts[Genotype.Type.MIXED.ordinal()];
+    }
+
    // ---------------------------------------------------------------------------------------------------------
    //
    // validation: extra-strict validation routines for paranoid users
@ -1357,10 +1366,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
                    throw new IllegalArgumentException("Duplicate allele added to VariantContext: " + a);
            }

-           // deal with the case where the first allele isn't the reference
+            // deal with the case where the first allele isn't the reference
            if ( a.isReference() ) {
                if ( sawRef )
-                    throw new IllegalArgumentException("Alleles for a VariantContext must contain a single reference allele: " + alleles);
+                    throw new IllegalArgumentException("Alleles for a VariantContext must contain at most one reference allele: " + alleles);
                alleleList.add(0, a);
                sawRef = true;
            }
@ -1372,7 +1381,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
            throw new IllegalArgumentException("Cannot create a VariantContext with an empty allele list");

        if ( alleleList.get(0).isNonReference() )
-            throw new IllegalArgumentException("Alleles for a VariantContext must contain a single reference allele: " + alleles);
+            throw new IllegalArgumentException("Alleles for a VariantContext must contain at least one reference allele: " + alleles);

        return alleleList;
    }
--- a/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java
+++ b/public/java/test/org/broadinstitute/sting/utils/variantcontext/VariantContextUnitTest.java
@ -252,6 +252,29 @@ public class VariantContextUnitTest extends BaseTest {
        Assert.assertEquals(vc.getSampleNames().size(), 0);
    }

+    @Test
+    public void testCreatingPartiallyCalledGenotype() {
+        List<Allele> alleles = Arrays.asList(Aref, C);
+        Genotype g = new Genotype("foo", Arrays.asList(C, Allele.NO_CALL), 10);
+        VariantContext vc = new VariantContext("test", snpLoc, snpLocStart, snpLocStop, alleles, Arrays.asList(g));
+
+        Assert.assertTrue(vc.isSNP());
+        Assert.assertEquals(vc.getNAlleles(), 2);
+        Assert.assertTrue(vc.hasGenotypes());
+        Assert.assertFalse(vc.isMonomorphic());
+        Assert.assertTrue(vc.isPolymorphic());
+        Assert.assertEquals(vc.getGenotype("foo"), g);
+        Assert.assertEquals(vc.getChromosomeCount(), 2); // we know that there are 2 chromosomes, even though one isn't called
+        Assert.assertEquals(vc.getChromosomeCount(Aref), 0);
+        Assert.assertEquals(vc.getChromosomeCount(C), 1);
+        Assert.assertFalse(vc.getGenotype("foo").isHet());
+        Assert.assertFalse(vc.getGenotype("foo").isHom());
+        Assert.assertFalse(vc.getGenotype("foo").isNoCall());
+        Assert.assertFalse(vc.getGenotype("foo").isHom());
+        Assert.assertTrue(vc.getGenotype("foo").isMixed());
+        Assert.assertEquals(vc.getGenotype("foo").getType(), Genotype.Type.MIXED);
+    }
+
    @Test (expectedExceptions = IllegalArgumentException.class)
    public void testBadConstructorArgs1() {
        new VariantContext("test", insLoc, insLocStart, insLocStop, Arrays.asList(delRef, ATCref));
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/DataProcessingPipeline.scala
@ -12,6 +12,7 @@ import net.sf.samtools.SAMFileHeader.SortOrder

 import org.broadinstitute.sting.queue.util.QScriptUtils
 import org.broadinstitute.sting.queue.function.ListWriterFunction
+import org.broadinstitute.sting.commandline.Hidden

 class DataProcessingPipeline extends QScript {
  qscript =>
@ -64,6 +65,9 @@ class DataProcessingPipeline extends QScript {
  @Input(doc="Decompose input BAM file and fully realign it using BWA and assume Pair Ended reads", fullName="use_bwa_pair_ended", shortName="bwape", required=false)
  var useBWApe: Boolean = false

+  @Input(doc="Decompose input BAM file and fully realign it using BWA SW", fullName="use_bwa_sw", shortName="bwasw", required=false)
+  var useBWAsw: Boolean = false
+
  @Input(doc="Number of threads BWA should use", fullName="bwa_threads", shortName="bt", required=false)
  var bwaThreads: Int = 1

@ -71,12 +75,24 @@ class DataProcessingPipeline extends QScript {
  var noValidation: Boolean = false


+  /****************************************************************************
+  * Hidden Parameters
+  ****************************************************************************/
+  @Hidden
+  @Input(doc="How many ways to scatter/gather", fullName="scatter_gather", shortName="sg", required=false)
+  var nContigs: Int = -1
+
+  @Hidden
+  @Input(doc="Define the default platform for Count Covariates -- useful for techdev purposes only.", fullName="default_platform", shortName="dp", required=false)
+  var defaultPlatform: String = ""
+
+
  /****************************************************************************
  * Global Variables
  ****************************************************************************/

  val queueLogDir: String = ".qlog/"  // Gracefully hide Queue's output
-  var nContigs: Int = 0               // Use the number of contigs for scatter gathering jobs
+
  var cleanModelEnum: ConsensusDeterminationModel = ConsensusDeterminationModel.USE_READS


@ -149,22 +165,28 @@ class DataProcessingPipeline extends QScript {
    var index = 1
    for (bam <- bams) {
      // first revert the BAM file to the original qualities
-      val revertedBAM = revertBAM(bam)
-      val readSortedBam = swapExt(revertedBAM, ".bam", "." + index + ".sorted.bam" )
      val saiFile1 = swapExt(bam, ".bam", "." + index + ".1.sai")
      val saiFile2 = swapExt(bam, ".bam", "." + index + ".2.sai")
      val realignedSamFile = swapExt(bam, ".bam", "." + index + ".realigned.sam")
      val realignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.bam")
      val rgRealignedBamFile = swapExt(bam, ".bam", "." + index + ".realigned.rg.bam")
+
      if (useBWAse) {
+        val revertedBAM = revertBAM(bam, true)
        add(bwa_aln_se(revertedBAM, saiFile1),
            bwa_sam_se(revertedBAM, saiFile1, realignedSamFile))
      }
-      else {
-        add(sortSam(revertedBAM, readSortedBam, SortOrder.queryname),
-            bwa_aln_pe(readSortedBam, saiFile1, 1),
-            bwa_aln_pe(readSortedBam, saiFile2, 2),
-            bwa_sam_pe(readSortedBam, saiFile1, saiFile2, realignedSamFile))
+      else if (useBWApe) {
+        val revertedBAM = revertBAM(bam, true)
+        add(bwa_aln_pe(revertedBAM, saiFile1, 1),
+            bwa_aln_pe(revertedBAM, saiFile2, 2),
+            bwa_sam_pe(revertedBAM, saiFile1, saiFile2, realignedSamFile))
+      }
+      else if (useBWAsw) {
+        val revertedBAM = revertBAM(bam, false)
+        val fastQ = swapExt(revertedBAM, ".bam", ".fq")
+        add(convertToFastQ(revertedBAM, fastQ),
+            bwa_sw(fastQ, realignedSamFile))
      }
      add(sortSam(realignedSamFile, realignedBamFile, SortOrder.coordinate))
      addReadGroups(realignedBamFile, rgRealignedBamFile, new SAMFileReader(bam))
@ -183,16 +205,16 @@ class DataProcessingPipeline extends QScript {
      ConsensusDeterminationModel.USE_READS
  }

-  def revertBams(bams: List[File]): List[File] = {
+  def revertBams(bams: List[File], removeAlignmentInformation: Boolean): List[File] = {
    var revertedBAMList: List[File] = List()
    for (bam <- bams)
-      revertedBAMList :+= revertBAM(bam)
+      revertedBAMList :+= revertBAM(bam, removeAlignmentInformation)
    return revertedBAMList
  }

-  def revertBAM(bam: File): File = {
+  def revertBAM(bam: File, removeAlignmentInformation: Boolean): File = {
    val revertedBAM = swapExt(bam, ".bam", ".reverted.bam")
-    add(revert(bam, revertedBAM))
+    add(revert(bam, revertedBAM, removeAlignmentInformation))
    return revertedBAM
  }

@ -210,9 +232,10 @@ class DataProcessingPipeline extends QScript {

    // keep a record of the number of contigs in the first bam file in the list
    val bams = QScriptUtils.createListFromFile(input)
-    nContigs = QScriptUtils.getNumberOfContigs(bams(0))
+    if (nContigs < 0)
+     nContigs = QScriptUtils.getNumberOfContigs(bams(0))

-    val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}
+    val realignedBAMs = if (useBWApe || useBWAse  || useBWAsw) {performAlignment(bams)} else {revertBams(bams, false)}

    // generate a BAM file per sample joining all per lane files if necessary
    val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)
@ -325,6 +348,7 @@ class DataProcessingPipeline extends QScript {
    this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
    this.input_file :+= inBam
    this.recal_file = outRecalFile
+    if (!defaultPlatform.isEmpty) this.default_platform = defaultPlatform
    if (!qscript.intervalString.isEmpty()) this.intervalsString ++= List(qscript.intervalString)
    else if (qscript.intervals != null) this.intervals :+= qscript.intervals
    this.scatterCount = nContigs
@ -408,12 +432,20 @@ class DataProcessingPipeline extends QScript {
    this.jobName = queueLogDir + outBam + ".rg"
  }

-  case class revert (inBam: File, outBam: File) extends RevertSam with ExternalCommonArgs {
+  case class revert (inBam: File, outBam: File, removeAlignmentInfo: Boolean) extends RevertSam with ExternalCommonArgs {
    this.output = outBam
    this.input :+= inBam
+    this.removeAlignmentInformation = removeAlignmentInfo;
+    this.sortOrder = if (removeAlignmentInfo) {SortOrder.queryname} else {SortOrder.coordinate}
    this.analysisName = queueLogDir + outBam + "revert"
    this.jobName = queueLogDir + outBam + ".revert"
+  }

+  case class convertToFastQ (inBam: File, outFQ: File) extends SamToFastq with ExternalCommonArgs {
+    this.input :+= inBam
+    this.fastq = outFQ
+    this.analysisName = queueLogDir + outFQ + "convert_to_fastq"
+    this.jobName = queueLogDir + outFQ + ".convert_to_fastq"
  }

  case class bwa_aln_se (inBam: File, outSai: File) extends CommandLineFunction with ExternalCommonArgs {
@ -453,6 +485,14 @@ class DataProcessingPipeline extends QScript {
    this.jobName = queueLogDir + outBam + ".bwa_sam_pe"
  }

+  case class bwa_sw (inFastQ: File, outBam: File) extends CommandLineFunction with ExternalCommonArgs {
+    @Input(doc="fastq file to be aligned") var fq = inFastQ
+    @Output(doc="output bam file") var bam = outBam
+    def commandLine = bwaPath + " bwasw -t " + bwaThreads + " " + reference + " " + fq + " > " + bam
+    this.analysisName = queueLogDir + outBam + ".bwasw"
+    this.jobName = queueLogDir + outBam + ".bwasw"
+  }
+
  case class writeList(inBams: List[File], outBamList: File) extends ListWriterFunction {
    this.inputFiles = inBams
    this.listFile = outBamList
--- a/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
+++ b/public/scala/qscript/org/broadinstitute/sting/queue/qscripts/MethodsDevelopmentCallingPipeline.scala
@ -98,13 +98,17 @@ class MethodsDevelopmentCallingPipeline extends QScript {
  // BUGBUG: We no longer support b36/hg18 because several of the necessary files aren't available aligned to those references

  val targetDataSets: Map[String, Target] = Map(
+    "NA12878_gold" -> new Target("NA12878.goldStandard", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
+              new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/data/goldStandard.list"),
+              new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"),          // ** There is no gold standard for the gold standard **
+              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, lowPass, !exome, 391),
    "NA12878_wgs_b37" -> new Target("NA12878.HiSeq.WGS.b37", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.hg19.bam"),
-              new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
+              new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"),
              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
    "NA12878_wgs_decoy" -> new Target("NA12878.HiSeq.WGS.b37_decoy", b37_decoy, dbSNP_b37, hapmap_b37, indelMask_b37,
              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/CEUTrio.HiSeq.WGS.b37_decoy.NA12878.clean.dedup.recal.bam"),
-              new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.HiSeq19.filtered.vcf"),          // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
+              new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.HiSeq19.filtered.vcf"),          // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.noChrY.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
    "NA12878_wgs_hg18" -> new Target("NA12878.HiSeq.WGS.hg18", hg18, dbSNP_hg18_129, hapmap_hg18,
              "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
@ -142,7 +146,7 @@ class MethodsDevelopmentCallingPipeline extends QScript {
              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.3, 99.0, !lowPass, !exome, 3),
    "GA2hg19" -> new Target("NA12878.GA2.hg19", hg19, dbSNP_b37, hapmap_b37, indelMask_b37,
              new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.GA2.WGS.bwa.cleaned.hg19.bam"),
-              new File("/humgen/gsa-hpprojects/dev/carneiro/hiseq19/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
+              new File("/humgen/gsa-hpprojects/dev/carneiro/NA12878/analysis/snps/NA12878.GA2.hg19.filtered.vcf"),
              "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.hg19.intervals", 2.14, 99.0, !lowPass, !exome, 1),
    "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
              new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/gatk/VcfGatherFunction.scala
@ -36,8 +36,6 @@ class VcfGatherFunction extends CombineVariants with GatherFunction {
  private lazy val originalGATK = this.originalFunction.asInstanceOf[CommandLineGATK]

  override def freezeFieldValues {
-    this.memoryLimit = Some(1)
-
    this.jarFile = this.originalGATK.jarFile
    this.reference_sequence = this.originalGATK.reference_sequence
    this.intervals = this.originalGATK.intervals
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/PicardBamFunction.scala
@ -50,8 +50,8 @@ trait PicardBamFunction extends JavaCommandLineFunction {
  abstract override def commandLine = super.commandLine +
    Array(
      repeat(" INPUT=", inputBams),
-      " OUTPUT=" + outputBam,
      " TMP_DIR=" + jobTempDir,
+      optional(" OUTPUT=", outputBam),
      optional(" COMPRESSION_LEVEL=", compressionLevel),
      optional(" VALIDATION_STRINGENCY=", validationStringency),
      optional(" SO=", sortOrder),
--- a/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala
+++ b/public/scala/src/org/broadinstitute/sting/queue/extensions/picard/SamToFastq.scala
@ -0,0 +1,77 @@
+package org.broadinstitute.sting.queue.extensions.picard
+
+import org.broadinstitute.sting.commandline._
+
+import java.io.File
+
+/*
+ * Created by IntelliJ IDEA.
+ * User: carneiro
+ * Date: 6/22/11
+ * Time: 10:35 AM
+ */
+class SamToFastq extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
+  analysisName = "SamToFastq"
+  javaMainClass = "net.sf.picard.sam.SamToFastq"
+
+  @Input(shortName = "input", fullName = "input_bam_files", required = true, doc = "Input SAM/BAM file to extract reads from.")
+  var input: List[File] = Nil
+
+  @Output(shortName = "fastq", fullName = "output_fastq_file", required = true, doc = "Output fastq file (single-end fastq or, if paired, first end of the pair fastq).")
+  var fastq: File = _
+
+  @Output(shortName = "se", fullName = "second_end_fastq", required = false, doc = "Output fastq file (if paired, second end of the pair fastq).")
+  var secondEndFastQ: File = _
+
+  @Argument(shortName = "opg", fullName = "output_per_readgroup", required = false, doc = "Output a fastq file per read group (two fastq files per read group if the group is paired).")
+  var outputPerReadGroup: Boolean = false
+
+  @Argument(shortName = "od", fullName = "output_dir", required = false, doc = "Directory in which to output the fastq file(s). Used only when OUTPUT_PER_RG is true.")
+  var outputDir: File = _
+
+  @Argument(shortName = "rr", fullName = "re_reverse", required = false, doc = "Re-reverse bases and qualities of reads with negative strand flag set before writing them to fastq.")
+  var reReverse: Boolean = true
+
+  @Argument(shortName = "nonpf", fullName = "include_non_pf_reads", required = false, doc = "Include non-PF reads from the SAM file into the output FASTQ files.")
+  var includeNonPFReads: Boolean = false
+
+  @Argument(shortName = "cat", fullName = "clipping_attribute", required = false, doc = "The attribute that stores the position at which the SAM record should be clipped.")
+  var clippingAttribute: String = null
+
+  @Argument(shortName = "cac", fullName = "clipping_action", required = false, doc = "The action that should be taken with clipped reads: 'X' means the reads and qualities should be trimmed at the clipped position; 'N' means the bases should be changed to Ns in the clipped region; and any integer means that the base qualities should be set to that value in the clipped region.")
+  var clippingAction: String = null
+
+  @Argument(shortName = "r1t", fullName = "read_one_trim", required = false, doc = "The number of bases to trim from the beginning of read 1.")
+  var readOneTrim: Int = -1
+
+  @Argument(shortName = "r1mbtw", fullName = "read_one_max_bases_to_write", required = false, doc = "The maximum number of bases to write from read 1 after trimming. If there are fewer than this many bases left after trimming, all will be written. If this value is null then all bases left after trimming will be written.")
+  var readOneMaxBasesToWrite: Int = -1
+
+  @Argument(shortName = "r2t", fullName = "read_two_trim", required = false, doc = "The number of bases to trim from the beginning of read 2.")
+  var readTwoTrim: Int = -1
+
+  @Argument(shortName = "r2mbtw", fullName = "read_two_max_bases_to_write", required = false, doc = "The maximum number of bases to write from read 2 after trimming. If there are fewer than this many bases left after trimming, all will be written. If this value is null then all bases left after trimming will be written.")
+  var readTwoMaxBasesToWrite: Int = -1
+
+  @Argument(shortName = "inpa", fullName = "include_non_primary_alignments", required = false, doc = "If true, include non-primary alignments in the output. Support of non-primary alignments in SamToFastq is not comprehensive, so there may be exceptions if this is set to true and there are paired reads with non-primary alignments.")
+  var includeNonPrimaryAlignments: Boolean = false
+
+  override def inputBams = input
+  override def outputBam = null
+  this.sortOrder = null
+
+  override def commandLine = super.commandLine +
+       " FASTQ=" + fastq +
+       optional(" SECOND_END_FASTQ=", secondEndFastQ) +
+       conditionalParameter(outputPerReadGroup, optional(" OUTPUT_PER_RG=", outputPerReadGroup)) +
+       optional(" OUTPUT_DIR=", outputDir) +
+       conditionalParameter(!reReverse, optional(" RE_REVERSE=", reReverse)) +
+       conditionalParameter(includeNonPFReads, optional(" INCLUDE_NON_PF_READS=", includeNonPFReads)) +
+       optional(" CLIPPING_ATTRIBUTE=", clippingAttribute) +
+       optional(" CLIPPING_ACTION=", clippingAction) +
+       conditionalParameter (readOneTrim >= 0, optional(" READ1_TRIM=", readOneTrim)) +
+       conditionalParameter (readOneMaxBasesToWrite >= 0, optional(" READ1_MAX_BASES_TO_WRITE=", readOneMaxBasesToWrite)) +
+       conditionalParameter (readTwoTrim >= 0, optional(" READ2_TRIM=", readTwoTrim)) +
+       conditionalParameter (readTwoMaxBasesToWrite >=0, optional(" READ2_MAX_BASES_TO_WRITE=", readTwoMaxBasesToWrite)) +
+       conditionalParameter (includeNonPrimaryAlignments, optional(" INCLUDE_NON_PRIMARY_ALIGNMENTS=", includeNonPrimaryAlignments))
+}