Merge branch 'master' into rodTesting

Conflicts:
	public/java/src/org/broadinstitute/sting/gatk/walkers/variantutils/VariantsToTable.java
This commit is contained in:
Mark DePristo 2011-08-26 00:35:41 -04:00
commit eef1ac415a
12 changed files with 368 additions and 200 deletions

View File

@ -156,7 +156,7 @@ public class UnifiedArgumentCollection {
public boolean OUTPUT_DEBUG_INDEL_INFO = false;
@Hidden
@Argument(fullName = "dovit", shortName = "dovit", doc = "Output indel debug info", required = false)
@Argument(fullName = "dovit", shortName = "dovit", doc = "Perform full Viterbi calculation when evaluating the HMM", required = false)
public boolean dovit = false;
@Hidden

View File

@ -274,7 +274,7 @@ public class PairHMMIndelErrorModel {
this.doViterbi = dovit;
}
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
public PairHMMIndelErrorModel(double indelGOP, double indelGCP, boolean deb, boolean doCDP) {
this.logGapOpenProbability = -indelGOP/10.0; // QUAL to log prob
@ -754,7 +754,7 @@ public class PairHMMIndelErrorModel {
// check if we've already computed likelihoods for this pileup element (i.e. for this read at this location)
if (indelLikelihoodMap.containsKey(p)) {
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
HashMap<Allele,Double> el = indelLikelihoodMap.get(p);
int j=0;
for (Allele a: haplotypeMap.keySet()) {
readLikelihoods[readIdx][j++] = el.get(a);
@ -1055,7 +1055,6 @@ public class PairHMMIndelErrorModel {
genotypeLikelihoods[i] -= maxElement;
return genotypeLikelihoods;
}
/**

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.walkers.variantutils;
import org.apache.poi.hpsf.Variant;
import org.broadinstitute.sting.commandline.*;
import org.broadinstitute.sting.gatk.arguments.StandardVariantContextInputArgumentCollection;
import org.broadinstitute.sting.utils.MathUtils;
@ -165,6 +166,23 @@ import java.util.*;
* -o output.vcf \
* -fraction 0.5
*
* Select only indels from a VCF:
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType INDEL
*
* Select only multi-allelic SNPs and MNPs from a VCF (i.e. SNPs with more than one allele listed in the ALT column):
* java -Xmx2g -jar GenomeAnalysisTK.jar \
* -R ref.fasta \
* -T SelectVariants \
* --variant input.vcf \
* -o output.vcf \
* -selectType SNP -selectType MNP \
* -restrictAllelesTo MULTIALLELIC
*
* </pre>
*
*/
@ -223,6 +241,18 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="excludeFiltered", shortName="ef", doc="Don't include filtered loci in the analysis", required=false)
private boolean EXCLUDE_FILTERED = false;
/**
* When this argument is used, we can choose to include only multiallelic or biallelic sites, depending on how many alleles are listed in the ALT column of a vcf.
* For example, a multiallelic record such as:
* 1 100 . A AAA,AAAAA
* will be excluded if "-restrictAllelesTo BIALLELIC" is included, because there are two alternate alleles, whereas a record such as:
* 1 100 . A T
* will be included in that case, but would be excluded if "-restrictAllelesTo MULTIALLELIC" is specified.
*/
@Argument(fullName="restrictAllelesTo", shortName="restrictAllelesTo", doc="Select only variants of a particular allelicity. Valid options are ALL (default), MULTIALLELIC or BIALLELIC", required=false)
private NumberAlleleRestriction alleleRestriction = NumberAlleleRestriction.ALL;
@Argument(fullName="keepOriginalAC", shortName="keepOriginalAC", doc="Don't update the AC, AF, or AN values in the INFO field after selecting", required=false)
private boolean KEEP_ORIGINAL_CHR_COUNTS = false;
@ -266,11 +296,14 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
@Argument(fullName="select_random_fraction", shortName="fraction", doc="Selects a fraction (a number between 0 and 1) of the total variants at random from the variant track", required=false)
private double fractionRandom = 0;
@Argument(fullName="selectSNPs", shortName="snps", doc="Select only SNPs from the input file", required=false)
private boolean SELECT_SNPS = false;
/**
* This argument selects particular kinds of variants out of a list. If left empty, there is no type selection and all variant types are considered for other selection criteria.
* When specified one or more times, a particular type of variant is selected.
*
*/
@Argument(fullName="selectTypeToInclude", shortName="selectType", doc="Select only a certain type of variants from the input file. Valid types are INDEL, SNP, MIXED, MNP, SYMBOLIC, NO_VARIATION. Can be specified multiple times", required=false)
private List<VariantContext.Type> TYPES_TO_INCLUDE = new ArrayList<VariantContext.Type>();
@Argument(fullName="selectIndels", shortName="indels", doc="Select only indels from the input file", required=false)
private boolean SELECT_INDELS = false;
@Hidden
@Argument(fullName="outMVFile", shortName="outMVFile", doc="USE YAML FILE INSTEAD (-SM) !!! string formatted as dad+mom=child where these parameters determine which sample names are examined", required=false)
@ -290,6 +323,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
}
/**
 * Allelicity filter used by the -restrictAllelesTo argument: selects sites by
 * how many alternate alleles the record carries (via VariantContext.isBiallelic()).
 */
public enum NumberAlleleRestriction {
// no restriction — every site passes this filter (the default)
ALL,
// keep only sites with exactly one alternate allele
BIALLELIC,
// keep only sites with more than one alternate allele
MULTIALLELIC
}
private ArrayList<VariantContext.Type> selectedTypes = new ArrayList<VariantContext.Type>();
private ArrayList<String> selectNames = new ArrayList<String>();
private List<VariantContextUtils.JexlVCMatchExp> jexls = null;
@ -354,6 +394,20 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
for ( String sample : samples )
logger.info("Including sample '" + sample + "'");
// if user specified types to include, add these, otherwise, add all possible variant context types to list of vc types to include
if (TYPES_TO_INCLUDE.isEmpty()) {
for (VariantContext.Type t : VariantContext.Type.values())
selectedTypes.add(t);
}
else {
for (VariantContext.Type t : TYPES_TO_INCLUDE)
selectedTypes.add(t);
}
// Initialize VCF header
Set<VCFHeaderLine> headerLines = VCFUtils.smartMergeHeaders(vcfRods.values(), logger);
headerLines.add(new VCFHeaderLine("source", "SelectVariants"));
@ -494,12 +548,13 @@ public class SelectVariants extends RodWalker<Integer, Integer> {
return 0;
}
// TODO - add ability to also select MNPs
// TODO - move variant selection arguments to the engine so other walkers can also do this
if (SELECT_INDELS && !(vc.isIndel() || vc.isMixed()))
if (alleleRestriction.equals(NumberAlleleRestriction.BIALLELIC) && !vc.isBiallelic())
continue;
if (SELECT_SNPS && !vc.isSNP())
if (alleleRestriction.equals(NumberAlleleRestriction.MULTIALLELIC) && vc.isBiallelic())
continue;
if (!selectedTypes.contains(vc.getType()))
continue;
VariantContext sub = subsetRecord(vc, samples);

View File

@ -124,8 +124,12 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* multi-allelic INFO field values can be lists of values.
*/
@Advanced
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Argument(fullName="keepMultiAllelic", shortName="KMA", doc="If provided, we will not require the site to be biallelic", required=false)
public boolean keepMultiAllelic = false;
@Hidden
@Argument(fullName="logACSum", shortName="logACSum", doc="Log sum of AC instead of max value in case of multiallelic variants", required=false)
public boolean logACSum = false;
/**
* By default, this tool throws a UserException when it encounters a field without a value in some record. This
@ -147,14 +151,13 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
if ( tracker == null ) // RodWalkers can make funky map calls
return 0;
nRecords++;
for ( VariantContext vc : tracker.getValues(variantCollection.variants, context.getLocation())) {
if ( (keepMultiAllelic || vc.isBiallelic()) && ( showFiltered || vc.isNotFiltered() ) ) {
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA);
List<String> vals = extractFields(vc, fieldsToTake, ALLOW_MISSING_DATA, keepMultiAllelic, logACSum);
out.println(Utils.join("\t", vals));
}
}
return 1;
}
@ -176,9 +179,11 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
* @param fields a non-null list of fields to capture from VC
* @param allowMissingData if false, then throws a UserException if any field isn't found in vc. Otherwise
* provides a value of NA
* @param kma if true, multiallelic variants are to be kept
* @param logsum if true, AF and AC are computed based on sum of allele counts. Otherwise, based on allele with highest count.
* @return
*/
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
private static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData, boolean kma, boolean logsum) {
List<String> vals = new ArrayList<String>();
for ( String field : fields ) {
@ -206,12 +211,31 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
}
if (field.equals("AF") || field.equals("AC")) {
if (val.contains(",")) {
// strip [,] and spaces
val = val.replace("[","");
val = val.replace("]","");
val = val.replace(" ","");
}
String afo = val;
double af=0;
if (afo.contains(",")) {
String[] afs = afo.split(",");
afs[0] = afs[0].substring(1,afs[0].length());
afs[afs.length-1] = afs[afs.length-1].substring(0,afs[afs.length-1].length()-1);
double[] afd = new double[afs.length];
for (int k=0; k < afd.length; k++)
afd[k] = Double.valueOf(afs[k]);
if (kma && logsum)
af = MathUtils.sum(afd);
else
af = MathUtils.arrayMax(afd);
//af = Double.valueOf(afs[0]);
}
else
if (!afo.equals("NA"))
af = Double.valueOf(afo);
val = Double.toString(af);
}
vals.add(val);
@ -220,6 +244,9 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
return vals;
}
/**
 * Backward-compatible overload retained for external callers. Delegates to the
 * private five-argument form with kma=false (require biallelic handling of AC/AF
 * lists, i.e. take the max allele count) and logsum=false (do not sum counts).
 *
 * @param vc the variant record to extract from
 * @param fields the field names to capture
 * @param allowMissingData if false, a missing field raises a UserException; otherwise "NA" is used
 * @return one formatted value per requested field
 */
public static List<String> extractFields(VariantContext vc, List<String> fields, boolean allowMissingData) {
return extractFields(vc, fields, allowMissingData, false, false);
}
//
// default reduce -- doesn't do anything at all
//
@ -243,7 +270,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
getters.put("REF", new Getter() {
public String get(VariantContext vc) {
String x = "";
if ( vc.hasReferenceBaseForIndel() ) {
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x=x+new String(new byte[]{refByte});
}
@ -255,7 +282,7 @@ public class VariantsToTable extends RodWalker<Integer, Integer> {
StringBuilder x = new StringBuilder();
int n = vc.getAlternateAlleles().size();
if ( n == 0 ) return ".";
if ( vc.hasReferenceBaseForIndel() ) {
if ( vc.hasReferenceBaseForIndel() && !vc.isSNP() ) {
Byte refByte = vc.getReferenceBaseForIndel();
x.append(new String(new byte[]{refByte}));
}

View File

@ -281,7 +281,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
VariantContext vc = null;
try {
vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes);
vc = new VariantContext(name, contig, pos, loc, alleles, qual, filters, attributes, ref.getBytes()[0]);
} catch (Exception e) {
generateException(e.getMessage());
}
@ -290,8 +290,7 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
if ( !header.samplesWereAlreadySorted() )
vc.getGenotypes();
// Trim bases of all alleles if necessary
return createVariantContextWithTrimmedAlleles(vc);
return vc;
}
/**
@ -516,25 +515,44 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
return true;
}
private static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
public static int computeForwardClipping(List<Allele> unclippedAlleles, String ref) {
boolean clipping = true;
// Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping,
// but it is not used for actually changing alleles - this is done in function
// createVariantContextWithTrimmedAlleles() below.
for (Allele a : unclippedAlleles) {
if (a.isSymbolic()) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() )
continue;
}
if (a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0])) {
if ( a.length() < 1 || (a.getBases()[0] != ref.getBytes()[0]) ) {
clipping = false;
break;
}
}
return (clipping) ? 1 : 0;
return (clipping) ? 1 : 0;
}
/**
 * Computes how many bases can be clipped from the END of every allele because they
 * match the trailing bases of the reference string. Extracted from the old inline
 * loop in clipAlleles().
 *
 * @param unclippedAlleles alleles to examine; symbolic alleles are ignored
 * @param ref the reference allele string the trailing bases are compared against
 * @param forwardClipping number of bases already clipped from the front (0 or 1 here);
 *                        limits how far reverse clipping may eat into an allele
 * @param lineNo current VCF line number, used only for the error message
 * @return the number of trailing bases common to all non-symbolic alleles and the reference
 */
protected static int computeReverseClipping(List<Allele> unclippedAlleles, String ref, int forwardClipping, int lineNo) {
int clipping = 0;
boolean stillClipping = true;
// grow the clip length one base at a time until some allele disagrees or runs out
while ( stillClipping ) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() )
continue;
// stop if clipping any further would consume the whole allele
if ( a.length() - clipping <= forwardClipping || a.length() - forwardClipping == 0 )
stillClipping = false;
else if ( ref.length() == clipping )
// reference exhausted before the alleles — malformed record
// NOTE(review): generateException is defined elsewhere in this codec; presumably it throws, aborting the loop — confirm
generateException("bad alleles encountered", lineNo);
// stop at the first trailing base that differs from the reference
else if ( a.getBases()[a.length()-clipping-1] != ref.getBytes()[ref.length()-clipping-1] )
stillClipping = false;
}
if ( stillClipping )
clipping++;
}
return clipping;
}
/**
* clip the alleles, based on the reference
*
@ -542,122 +560,30 @@ public abstract class AbstractVCFCodec implements FeatureCodec, NameAwareCodec,
* @param ref the reference string
* @param unclippedAlleles the list of unclipped alleles
* @param clippedAlleles output list of clipped alleles
* @param lineNo the current line number in the file
* @return the new reference end position of this event
*/
protected static int clipAlleles(int position, String ref, List<Allele> unclippedAlleles, List<Allele> clippedAlleles, int lineNo) {
// Note that the computation of forward clipping here is meant only to see whether there is a common
// base to all alleles, and to correctly compute reverse clipping,
// but it is not used for actually changing alleles - this is done in function
// createVariantContextWithTrimmedAlleles() below.
int forwardClipping = computeForwardClipping(unclippedAlleles, ref);
int reverseClipped = 0;
boolean clipping = true;
while (clipping) {
for (Allele a : unclippedAlleles) {
if (a.isSymbolic()) {
continue;
}
if (a.length() - reverseClipped <= forwardClipping || a.length() - forwardClipping == 0)
clipping = false;
else if (ref.length() == reverseClipped)
generateException("bad alleles encountered", lineNo);
else if (a.getBases()[a.length()-reverseClipped-1] != ref.getBytes()[ref.length()-reverseClipped-1])
clipping = false;
}
if (clipping) reverseClipped++;
}
int reverseClipping = computeReverseClipping(unclippedAlleles, ref, forwardClipping, lineNo);
if ( clippedAlleles != null ) {
for ( Allele a : unclippedAlleles ) {
if ( a.isSymbolic() ) {
clippedAlleles.add(a);
} else {
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(),0,a.getBases().length-reverseClipped),a.isReference()));
clippedAlleles.add(Allele.create(Arrays.copyOfRange(a.getBases(), forwardClipping, a.getBases().length-reverseClipping), a.isReference()));
}
}
}
// the new reference length
int refLength = ref.length() - reverseClipped;
int refLength = ref.length() - reverseClipping;
return position+Math.max(refLength - 1,0);
}
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;
// We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
Allele refAllele = inputVC.getReference();
if (!inputVC.isVariant())
trimVC = false;
else if (refAllele.isNull())
trimVC = false;
else {
trimVC = (computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
inputVC.getReference().getDisplayString()) > 0);
}
// nothing to do if we don't need to trim bases
if (trimVC) {
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (Allele a : inputVC.getAlleles()) {
if (a.isSymbolic()) {
alleles.add(a);
originalToTrimmedAlleleMap.put(a, a);
} else {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
Allele trimmedAllele = Allele.create(newBases, a.isReference());
alleles.add(trimmedAllele);
originalToTrimmedAlleleMap.put(a, trimmedAllele);
}
}
// detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
// example: mixed records such as {TA*,TGA,TG}
boolean hasNullAlleles = false;
for (Allele a: originalToTrimmedAlleleMap.values()) {
if (a.isNull())
hasNullAlleles = true;
if (a.isReference())
refAllele = a;
}
if (!hasNullAlleles)
return inputVC;
// now we can recreate new genotypes with trimmed alleles
for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
List<Allele> originalAlleles = sample.getValue().getAlleles();
List<Allele> trimmedAlleles = new ArrayList<Allele>();
for ( Allele a : originalAlleles ) {
if ( a.isCalled() )
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
else
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
}
return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
}
return inputVC;
}
public final static boolean canDecodeFile(final File potentialInput, final String MAGIC_HEADER_LINE) {
try {
return isVCFStream(new FileInputStream(potentialInput), MAGIC_HEADER_LINE) ||

View File

@ -209,6 +209,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
/**
* the complete constructor. Makes a complete VariantContext from its arguments
* This is the only constructor that is able to create indels! DO NOT USE THE OTHER ONES.
*
* @param source source
* @param contig the contig
@ -257,9 +258,10 @@ public class VariantContext implements Feature { // to enable tribble intergrati
* @param negLog10PError qual
* @param filters filters: use null for unfiltered and empty set for passes filters
* @param attributes attributes
* @param referenceBaseForIndel padded reference base
*/
public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes) {
this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, null, true);
public VariantContext(String source, String contig, long start, long stop, Collection<Allele> alleles, double negLog10PError, Set<String> filters, Map<String, ?> attributes, Byte referenceBaseForIndel) {
this(source, contig, start, stop, alleles, NO_GENOTYPES, negLog10PError, filters, attributes, referenceBaseForIndel, true);
}
/**
@ -293,7 +295,7 @@ public class VariantContext implements Feature { // to enable tribble intergrati
}
/**
* Create a new variant context without genotypes and no Perror, no filters, and no attributes
* Create a new variant context with genotypes but without Perror, filters, and attributes
*
* @param source source
* @param contig the contig

View File

@ -657,12 +657,84 @@ public class VariantContextUtils {
VariantContext merged = new VariantContext(name, loc.getContig(), loc.getStart(), loc.getStop(), alleles, genotypes, negLog10PError, filters, (mergeInfoWithMaxAC ? attributesWithMaxAC : attributes) );
// Trim the padded bases of all alleles if necessary
merged = AbstractVCFCodec.createVariantContextWithTrimmedAlleles(merged);
merged = createVariantContextWithTrimmedAlleles(merged);
if ( printMessages && remapped ) System.out.printf("Remapped => %s%n", merged);
return merged;
}
/**
 * Returns a copy of the input VariantContext with the single padding reference base
 * trimmed from the front of every allele (the VCF indel-padding convention), or the
 * input unchanged when no trimming is needed or trimming would not yield a null allele.
 * Moved here from AbstractVCFCodec so it can be shared (see merge()).
 *
 * @param inputVC the context to trim; never modified in place
 * @return a trimmed VariantContext, or inputVC itself when trimming does not apply
 */
public static VariantContext createVariantContextWithTrimmedAlleles(VariantContext inputVC) {
// see if we need to trim common reference base from all alleles
boolean trimVC;
// We need to trim common reference base from all alleles in all genotypes if a ref base is common to all alleles
Allele refAllele = inputVC.getReference();
if (!inputVC.isVariant())
trimVC = false;
else if (refAllele.isNull())
trimVC = false;
else {
// forward clipping > 0 means all alternate alleles share the reference's first base
trimVC = (AbstractVCFCodec.computeForwardClipping(new ArrayList<Allele>(inputVC.getAlternateAlleles()),
inputVC.getReference().getDisplayString()) > 0);
}
// nothing to do if we don't need to trim bases
if (trimVC) {
List<Allele> alleles = new ArrayList<Allele>();
Map<String, Genotype> genotypes = new TreeMap<String, Genotype>();
// set the reference base for indels in the attributes
Map<String,Object> attributes = new TreeMap<String,Object>(inputVC.getAttributes());
// maps each original allele to its trimmed form so genotypes can be rewritten consistently
Map<Allele, Allele> originalToTrimmedAlleleMap = new HashMap<Allele, Allele>();
for (Allele a : inputVC.getAlleles()) {
if (a.isSymbolic()) {
// symbolic alleles (e.g. <DEL>) carry no bases to trim
alleles.add(a);
originalToTrimmedAlleleMap.put(a, a);
} else {
// get bases for current allele and create a new one with trimmed bases
byte[] newBases = Arrays.copyOfRange(a.getBases(), 1, a.length());
Allele trimmedAllele = Allele.create(newBases, a.isReference());
alleles.add(trimmedAllele);
originalToTrimmedAlleleMap.put(a, trimmedAllele);
}
}
// detect case where we're trimming bases but resulting vc doesn't have any null allele. In that case, we keep original representation
// example: mixed records such as {TA*,TGA,TG}
boolean hasNullAlleles = false;
for (Allele a: originalToTrimmedAlleleMap.values()) {
if (a.isNull())
hasNullAlleles = true;
if (a.isReference())
refAllele = a;
}
if (!hasNullAlleles)
return inputVC;
// now we can recreate new genotypes with trimmed alleles
for ( Map.Entry<String, Genotype> sample : inputVC.getGenotypes().entrySet() ) {
List<Allele> originalAlleles = sample.getValue().getAlleles();
List<Allele> trimmedAlleles = new ArrayList<Allele>();
for ( Allele a : originalAlleles ) {
if ( a.isCalled() )
trimmedAlleles.add(originalToTrimmedAlleleMap.get(a));
else
// no-calls are preserved as no-calls rather than mapped
trimmedAlleles.add(Allele.NO_CALL);
}
genotypes.put(sample.getKey(), Genotype.modifyAlleles(sample.getValue(), trimmedAlleles));
}
// rebuild the context, recording the stripped padding base as the reference base for indels
return new VariantContext(inputVC.getSource(), inputVC.getChr(), inputVC.getStart(), inputVC.getEnd(), alleles, genotypes, inputVC.getNegLog10PError(), inputVC.filtersWereApplied() ? inputVC.getFilters() : null, attributes, new Byte(inputVC.getReference().getBases()[0]));
}
return inputVC;
}
public static Map<String, Genotype> stripPLs(Map<String, Genotype> genotypes) {
Map<String, Genotype> newGs = new HashMap<String, Genotype>(genotypes.size());

View File

@ -77,6 +77,19 @@ public class SelectVariantsIntegrationTest extends WalkerTest {
executeTest("testConcordance--" + testFile, spec);
}
@Test
// Integration test for the new -selectType / -restrictAllelesTo arguments:
// selects multiallelic MIXED-type records from complexExample1.vcf and compares
// the single output file against a known-good MD5 checksum.
public void testVariantTypeSelection() {
String testFile = validationDataLocation + "complexExample1.vcf";
WalkerTestSpec spec = new WalkerTestSpec(
"-T SelectVariants -R " + b36KGReference + " -restrictAllelesTo MULTIALLELIC -selectType MIXED --variant " + testFile + " -o %s -NO_HEADER",
1,
Arrays.asList("e0b12c0b47a8a7a988b3587b47bfa8cf")
);
executeTest("testVariantTypeSelection--" + testFile, spec);
}
@Test(enabled=false)
public void testRemovePLs() {
String testFile = validationDataLocation + "combine.3.vcf";

View File

@ -40,6 +40,7 @@ import java.util.Arrays;
import java.util.List;
public class LibDrmaaIntegrationTest extends BaseTest {
private String implementation = null;
@Test
public void testDrmaa() throws Exception {
@ -79,10 +80,24 @@ public class LibDrmaaIntegrationTest extends BaseTest {
Assert.fail(String.format("Could not get DRMAA implementation from the DRMAA library: %s", error.getString(0)));
System.out.println(String.format("DRMAA implementation(s): %s", drmaaImplementation.getString(0)));
this.implementation = drmaaImplementation.getString(0);
}
@Test
@Test(dependsOnMethods = { "testDrmaa" })
public void testSubmitEcho() throws Exception {
if (implementation.indexOf("LSF") >= 0) {
System.err.println(" *********************************************************");
System.err.println(" ***********************************************************");
System.err.println(" **** ****");
System.err.println(" **** Skipping LibDrmaaIntegrationTest.testSubmitEcho() ****");
System.err.println(" **** Are you using the dotkit .combined_LSF_SGE? ****");
System.err.println(" **** ****");
System.err.println(" ***********************************************************");
System.err.println(" *********************************************************");
return;
}
Memory error = new Memory(LibDrmaa.DRMAA_ERROR_STRING_BUFFER);
int errnum;
@ -129,7 +144,7 @@ public class LibDrmaaIntegrationTest extends BaseTest {
errnum = LibDrmaa.drmaa_set_vector_attribute(jt, LibDrmaa.DRMAA_V_ARGV, args, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);
if (errnum != LibDrmaa.DRMAA_ERRNO.DRMAA_ERRNO_SUCCESS)
Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_REMOTE_COMMAND, error.getString(0)));
Assert.fail(String.format("Could not set attribute \"%s\": %s", LibDrmaa.DRMAA_V_ARGV, error.getString(0)));
errnum = LibDrmaa.drmaa_run_job(jobIdMem, LibDrmaa.DRMAA_JOBNAME_BUFFER_LEN, jt, error, LibDrmaa.DRMAA_ERROR_STRING_BUFFER_LEN);

View File

@ -106,7 +106,7 @@ class DataProcessingPipeline extends QScript {
// Because the realignment only happens after these scripts are executed, in case you are using
// bwa realignment, this function will operate over the original bam files and output over the
// (to be realigned) bam files.
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, File] = {
def createSampleFiles(bamFiles: List[File], realignedBamFiles: List[File]): Map[String, List[File]] = {
// Creating a table with SAMPLE information from each input BAM file
val sampleTable = scala.collection.mutable.Map.empty[String, List[File]]
@ -131,24 +131,25 @@ class DataProcessingPipeline extends QScript {
sampleTable(sample) :+= rBam
}
}
return sampleTable.toMap
println("\n\n*** INPUT FILES ***\n")
// Creating one file for each sample in the dataset
val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
for ((sample, flist) <- sampleTable) {
println(sample + ":")
for (f <- flist)
println (f)
println()
val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
sampleBamFiles(sample) = sampleFileName
add(writeList(flist, sampleFileName))
}
println("*** INPUT FILES ***\n\n")
return sampleBamFiles.toMap
// println("\n\n*** INPUT FILES ***\n")
// // Creating one file for each sample in the dataset
// val sampleBamFiles = scala.collection.mutable.Map.empty[String, File]
// for ((sample, flist) <- sampleTable) {
//
// println(sample + ":")
// for (f <- flist)
// println (f)
// println()
//
// val sampleFileName = new File(qscript.outputDir + qscript.projectName + "." + sample + ".list")
// sampleBamFiles(sample) = sampleFileName
// //add(writeList(flist, sampleFileName))
// }
// println("*** INPUT FILES ***\n\n")
//
// return sampleBamFiles.toMap
}
// Rebuilds the Read Group string to give BWA
@ -224,7 +225,10 @@ class DataProcessingPipeline extends QScript {
def script = {
// final output list of processed bam files
var cohortList: List[File] = List()
// sets the model for the Indel Realigner
cleanModelEnum = getIndelCleaningModel()
// keep a record of the number of contigs in the first bam file in the list
@ -233,28 +237,19 @@ class DataProcessingPipeline extends QScript {
val realignedBAMs = if (useBWApe || useBWAse) {performAlignment(bams)} else {revertBams(bams)}
// Generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, File] = createSampleFiles(bams, realignedBAMs)
// generate a BAM file per sample joining all per lane files if necessary
val sampleBAMFiles: Map[String, List[File]] = createSampleFiles(bams, realignedBAMs)
// Final output list of processed bam files
var cohortList: List[File] = List()
// Simple progress report
println("\nFound the following samples: ")
for ((sample, file) <- sampleBAMFiles)
println("\t" + sample + " -> " + file)
println("\n")
// If this is a 'knowns only' indel realignment run, do it only once for all samples.
// if this is a 'knowns only' indel realignment run, do it only once for all samples.
val globalIntervals = new File(outputDir + projectName + ".intervals")
if (cleaningModel == ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(null, globalIntervals))
// Put each sample through the pipeline
for ((sample, sampleFile) <- sampleBAMFiles) {
val bam = if (sampleFile.endsWith(".list")) {swapExt(sampleFile, ".list", ".bam")} else {sampleFile}
// put each sample through the pipeline
for ((sample, bamList) <- sampleBAMFiles) {
// BAM files generated by the pipeline
val bam = new File(qscript.projectName + "." + sample + ".bam")
val cleanedBam = swapExt(bam, ".bam", ".clean.bam")
val dedupedBam = swapExt(bam, ".bam", ".clean.dedup.bam")
val recalBam = swapExt(bam, ".bam", ".clean.dedup.recal.bam")
@ -272,15 +267,16 @@ class DataProcessingPipeline extends QScript {
// Validation is an optional step for the BAM file generated after
// alignment and the final bam file of the pipeline.
if (!noValidation && sampleFile.endsWith(".bam")) { // todo -- implement validation for .list BAM files
if (!noValidation) {
for (sampleFile <- bamList)
add(validate(sampleFile, preValidateLog),
validate(recalBam, postValidateLog))
}
if (cleaningModel != ConsensusDeterminationModel.KNOWNS_ONLY)
add(target(sampleFile, targetIntervals))
add(target(bamList, targetIntervals))
add(clean(sampleFile, targetIntervals, cleanedBam),
add(clean(bamList, targetIntervals, cleanedBam),
dedup(cleanedBam, dedupedBam, metricsFile),
cov(dedupedBam, preRecalFile),
recal(dedupedBam, preRecalFile, recalBam),
@ -320,9 +316,9 @@ class DataProcessingPipeline extends QScript {
this.maxRecordsInRam = 100000
}
case class target (inBams: File, outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
case class target (inBams: List[File], outIntervals: File) extends RealignerTargetCreator with CommandLineGATKArgs {
if (cleanModelEnum != ConsensusDeterminationModel.KNOWNS_ONLY)
this.input_file :+= inBams
this.input_file = inBams
this.out = outIntervals
this.mismatchFraction = 0.0
this.known :+= qscript.dbSNP
@ -333,8 +329,8 @@ class DataProcessingPipeline extends QScript {
this.jobName = queueLogDir + outIntervals + ".target"
}
case class clean (inBams: File, tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file :+= inBams
case class clean (inBams: List[File], tIntervals: File, outBam: File) extends IndelRealigner with CommandLineGATKArgs {
this.input_file = inBams
this.targetIntervals = tIntervals
this.out = outBam
this.known :+= qscript.dbSNP

View File

@ -4,9 +4,9 @@ import org.broadinstitute.sting.queue.QScript
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.util.QScriptUtils
import net.sf.samtools.SAMFileHeader.SortOrder
import org.broadinstitute.sting.queue.extensions.picard.{SortSam, AddOrReplaceReadGroups}
import org.broadinstitute.sting.utils.exceptions.UserException
import org.broadinstitute.sting.commandline.Hidden
import org.broadinstitute.sting.queue.extensions.picard.{ReorderSam, SortSam, AddOrReplaceReadGroups}
/**
* Created by IntelliJ IDEA.
@ -16,7 +16,7 @@ import org.broadinstitute.sting.commandline.Hidden
*/
class RecalibrateBaseQualities extends QScript {
class PacbioProcessingPipeline extends QScript {
@Input(doc="input FASTA/FASTQ/BAM file - or list of FASTA/FASTQ/BAM files. ", shortName="i", required=true)
var input: File = _
@ -39,13 +39,16 @@ class RecalibrateBaseQualities extends QScript {
@Input(doc="The path to the binary of bwa to align fasta/fastq files", fullName="path_to_bwa", shortName="bwa", required=false)
var bwaPath: File = _
@Input(doc="Input is a BLASR generated BAM file", shortName = "blasr", fullName="blasr_bam", required=false)
var BLASR_BAM: Boolean = false
@Hidden
@Input(doc="The default base qualities to use before recalibration. Default is Q20 (should be good for every dataset)." , shortName = "dbq", required=false)
var dbq: Int = 20
@Hidden
@Input(shortName="bwastring", required=false)
var bwastring: String = ""
val queueLogDir: String = ".qlog/"
@ -57,8 +60,6 @@ class RecalibrateBaseQualities extends QScript {
var USE_BWA: Boolean = false
println("DEBUG: processing " + file + "\nDEBUG: name -- " + file.getName)
if (file.endsWith(".fasta") || file.endsWith(".fq")) {
if (bwaPath == null) {
throw new UserException("You provided a fasta/fastq file but didn't provide the path for BWA");
@ -69,28 +70,34 @@ class RecalibrateBaseQualities extends QScript {
// FASTA -> BAM steps
val alignedSam: File = file.getName + ".aligned.sam"
val sortedBam: File = swapExt(alignedSam, ".sam", ".bam")
val qualBam: File = swapExt(sortedBam, ".bam", ".q.bam")
val rgBam: File = swapExt(file, ".bam", ".rg.bam")
val bamBase = if (USE_BWA) {rgBam} else {file}
// BAM Steps
val mqBAM: File = swapExt(bamBase, ".bam", ".mq.bam")
val recalFile1: File = swapExt(bamBase, ".bam", ".recal1.csv")
val recalFile2: File = swapExt(bamBase, ".bam", ".recal2.csv")
val recalBam: File = swapExt(bamBase, ".bam", ".recal.bam")
val path1: String = recalBam + ".before"
val path2: String = recalBam + ".after"
if (USE_BWA) {
add(align(file, alignedSam),
sortSam(alignedSam, sortedBam),
addQuals(sortedBam, qualBam, dbq),
addReadGroup(qualBam, rgBam, sample))
addReadGroup(sortedBam, rgBam, sample))
}
add(cov(bamBase, recalFile1),
recal(bamBase, recalFile1, recalBam),
else if (BLASR_BAM) {
val reorderedBAM = swapExt(bamBase, ".bam", ".reordered.bam")
add(reorder(bamBase, reorderedBAM),
changeMQ(reorderedBAM, mqBAM))
}
val bam = if (BLASR_BAM) {mqBAM} else {bamBase}
add(cov(bam, recalFile1),
recal(bam, recalFile1, recalBam),
cov(recalBam, recalFile2),
analyzeCovariates(recalFile1, path1),
analyzeCovariates(recalFile2, path2))
@ -110,13 +117,13 @@ class RecalibrateBaseQualities extends QScript {
case class align(@Input inFastq: File, @Output outSam: File) extends ExternalCommonArgs {
def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z10 -t8 " + reference + " " + inFastq + " > " + outSam
def commandLine = bwaPath + " bwasw -b5 -q2 -r1 -z20 -t16 " + reference + " " + inFastq + " > " + outSam
this.memoryLimit = 8
this.analysisName = queueLogDir + outSam + ".bwa_sam_se"
this.jobName = queueLogDir + outSam + ".bwa_sam_se"
}
case class sortSam (@Input inSam: File, @Output outBam: File) extends SortSam with ExternalCommonArgs {
@Output(doc="output bai file") var bai = swapExt(outBam, ".bam", ".bai")
case class sortSam (inSam: File, outBam: File) extends SortSam with ExternalCommonArgs {
this.input = List(inSam)
this.output = outBam
this.sortOrder = SortOrder.coordinate
@ -124,10 +131,16 @@ class RecalibrateBaseQualities extends QScript {
this.jobName = queueLogDir + outBam + ".sortSam"
}
case class addQuals(inBam: File, outBam: File, qual: Int) extends PrintReads with CommandLineGATKArgs {
case class reorder (inSam: File, outSam: File) extends ReorderSam with ExternalCommonArgs {
this.input = List(inSam)
this.output = outSam
this.sortReference = reference
}
case class changeMQ(inBam: File, outBam: File) extends PrintReads with CommandLineGATKArgs {
this.input_file :+= inBam
this.out = outBam
this.DBQ = qual
this.read_filter :+= "ReassignMappingQuality"
}
case class addReadGroup (inBam: File, outBam: File, sample: String) extends AddOrReplaceReadGroups with ExternalCommonArgs {
@ -145,7 +158,8 @@ class RecalibrateBaseQualities extends QScript {
}
case class cov (inBam: File, outRecalFile: File) extends CountCovariates with CommandLineGATKArgs {
this.rodBind :+= RodBind("dbsnp", "VCF", dbSNP)
this.DBQ = dbq
this.knownSites :+= dbSNP
this.covariate ++= List("ReadGroupCovariate", "QualityScoreCovariate", "CycleCovariate", "DinucCovariate")
this.input_file :+= inBam
this.recal_file = outRecalFile
@ -155,6 +169,7 @@ class RecalibrateBaseQualities extends QScript {
}
case class recal (inBam: File, inRecalFile: File, outBam: File) extends TableRecalibration with CommandLineGATKArgs {
this.DBQ = dbq
this.input_file :+= inBam
this.recal_file = inRecalFile
this.out = outBam

View File

@ -0,0 +1,48 @@
package org.broadinstitute.sting.queue.extensions.picard
import org.broadinstitute.sting.commandline._
import java.io.File
/*
* Created by IntelliJ IDEA.
* User: carneiro
* Date: 6/22/11
* Time: 10:35 AM
*/
class ReorderSam extends org.broadinstitute.sting.queue.function.JavaCommandLineFunction with PicardBamFunction {
analysisName = "ReorderSam"
javaMainClass = "net.sf.picard.sam.ReorderSam"
@Input(doc="Input file (bam or sam) to extract reads from.", shortName = "input", fullName = "input_bam_files", required = true)
var input: List[File] = Nil
@Output(doc="Output file (bam or sam) to write extracted reads to.", shortName = "output", fullName = "output_bam_file", required = true)
var output: File = _
@Output(doc="The output bam index", shortName = "out_index", fullName = "output_bam_index_file", required = false)
var outputIndex: File = _
@Argument(doc="Reference sequence to reorder reads to match.", shortName = "ref", fullName = "sort_reference", required = true)
var sortReference: File = _
@Argument(doc="If true, then allows only a partial overlap of the BAM contigs with the new reference sequence contigs. By default, this tool requires a corresponding contig in the new reference for each read contig.", shortName = "aic", fullName = "allow_incomplete_concordance", required = false)
var ALLOW_INCOMPLETE_DICT_CONCORDANCE: Boolean = _
@Argument(doc="If true, then permits mapping from a read contig to a new reference contig with the same name but a different length. Highly dangerous, only use if you know what you are doing.", shortName = "acld", fullName = "allow_contig_length_discordance", required = false)
var ALLOW_CONTIG_LENGTH_DISCORDANCE: Boolean = _
override def freezeFieldValues() {
super.freezeFieldValues()
if (outputIndex == null && output != null)
outputIndex = new File(output.getName.stripSuffix(".bam") + ".bai")
}
override def inputBams = input
override def outputBam = output
this.createIndex = Some(true)
this.sortOrder = null
override def commandLine = super.commandLine +
" REFERENCE=" + sortReference +
optional(" ALLOW_INCOMPLETE_DICT_CONCORDANCE=", ALLOW_INCOMPLETE_DICT_CONCORDANCE)
optional(" ALLOW_CONTIG_LENGTH_DISCORDANCE=", ALLOW_CONTIG_LENGTH_DISCORDANCE)
}