Somewhat major update. Changes:

- ProduceBeagleInputWalker
 + Now takes a validation ROD and a prior to assign to it; it will use the validation genotypes in place of the variant genotypes if both are present
 + Takes a bootstrap argument -- can use a given percentage of the validation sites
 + Optionally takes a bootstrap output argument -- re-prints the validation VCF, filtering those sites used as part of the bootstrap
- BeagleOutputToVCFWalker
 + Now filters out sites where the genotypes have been reverted to hom-ref
 + Now calls into the new VariantContextUtils to calculate AC/AN

- Queue
 + New pipeline libraries for easy QScript creation; still a work in progress, but this is a substantial prototype
 + full calling pipeline v2 uses the above libraries
 + minor changes to some of my own scripts
 + no more need for contig interval lists; these will be parsed out of your normal interval list when it is provided



git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4459 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
chartl 2010-10-08 13:30:28 +00:00
parent e02f837659
commit 21ec44339d
12 changed files with 1053 additions and 131 deletions

View File

@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
@ -161,6 +162,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
int numGenotypesChangedByBeagle = 0;
Integer alleleCountH = 0, chrCountH = 0;
Double alleleFrequencyH = 0.0;
int beagleVarCounts = 0;
Map<String,Genotype> hapmapGenotypes = null;
@ -284,31 +286,26 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
originalAttributes.put("OG",".");
}
Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased);
if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) {
beagleVarCounts++;
}
genotypes.put(originalGenotypes.getKey(), imputedGenotype);
}
VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
Set<Allele> altAlleles = filteredVC.getAlternateAlleles();
StringBuffer altAlleleCountString = new StringBuffer();
for ( Allele allele : altAlleles ) {
if ( altAlleleCountString.length() > 0 )
altAlleleCountString.append(",");
altAlleleCountString.append(filteredVC.getChromosomeCount(allele));
VariantContext filteredVC;
if ( beagleVarCounts > 0)
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes());
else {
Set<String> removedFilters = vc_input.filtersWereApplied() ? new HashSet<String>(vc_input.getFilters()) : new HashSet<String>(1);
removedFilters.add(String.format("BGL_RM_WAS_%s/%s",vc_input.getReference().getBaseString(),vc_input.getAlternateAllele(0)));
filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet<Allele>(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes());
}
HashMap<String, Object> attributes = new HashMap<String, Object>(filteredVC.getAttributes());
if ( filteredVC.getChromosomeCount() > 0 ) {
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount()));
if ( altAlleleCountString.length() > 0 ) {
attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f",
Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount())));
}
}
// re-compute chromosome counts
VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false);
// Get Hapmap AC and AF
if (vc_comp != null) {
@ -322,8 +319,7 @@ public class BeagleOutputToVCFWalker extends RodWalker<Integer, Integer> {
attributes.put("R2", beagleR2Feature.getR2value().toString() );
vcfWriter.add(VariantContext.modifyAttributes(filteredVC, attributes), ref.getBase());
vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase());
return 1;

View File

@ -28,13 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.beagle;
import org.broad.tribble.util.variantcontext.Allele;
import org.broad.tribble.util.variantcontext.Genotype;
import org.broad.tribble.util.variantcontext.VariantContext;
import org.broad.tribble.vcf.VCFConstants;
import org.broad.tribble.vcf.VCFWriter;
import org.broad.tribble.vcf.*;
import org.broadinstitute.sting.commandline.Argument;
import org.broadinstitute.sting.commandline.Hidden;
import org.broadinstitute.sting.commandline.Output;
import org.broadinstitute.sting.gatk.contexts.AlignmentContext;
import org.broadinstitute.sting.gatk.contexts.ReferenceContext;
import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils;
import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource;
import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker;
import org.broadinstitute.sting.gatk.walkers.RodWalker;
import org.broadinstitute.sting.gatk.walkers.RMD;
@ -42,6 +43,8 @@ import org.broadinstitute.sting.gatk.walkers.Requires;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.MathUtils;
import org.broadinstitute.sting.utils.SampleUtils;
import org.broadinstitute.sting.utils.exceptions.StingException;
import org.broadinstitute.sting.utils.vcf.VCFUtils;
import java.io.PrintStream;
import java.util.*;
@ -53,6 +56,7 @@ import java.util.*;
public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public static final String ROD_NAME = "variant";
public static final String VALIDATION_ROD_NAME = "validation";
@Output(doc="File to which BEAGLE input should be written",required=true)
protected PrintStream beagleWriter = null;
@ -61,9 +65,24 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public PrintStream beagleGenotypesWriter = null;
@Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false)
public double insertedNoCallRate = 0;
@Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false)
public double validationPrior = -1.0;
@Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false)
public double bootstrap = 0.0;
@Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false)
VCFWriter bootstrapVCFOutput = null;
@Hidden
@Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false)
public double variantPrior = 0.96;
private Set<String> samples = null;
private Set<String> BOOTSTRAP_FILTER = new HashSet<String>( Arrays.asList("bootstrap") );
private Random generator;
private int bootstrapCount = -1;
private int bootstrapSetSize = 0;
private int testSetSize = 0;
public void initialize() {
generator = new Random();
@ -77,114 +96,176 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println("dummy header");
if ( bootstrapVCFOutput != null ) {
initializeVcfWriter();
}
}
public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) {
if( tracker != null ) {
if( tracker != null ) {
GenomeLoc loc = context.getLocation();
VariantContext vc_eval;
VariantContext variant_eval;
VariantContext validation_eval;
vc_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
if ( vc_eval == null || vc_eval.isFiltered() )
variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false);
validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false);
if ( goodSite(variant_eval,validation_eval) ) {
if ( useValidation(variant_eval,validation_eval, ref) ) {
writeBeagleOutput(validation_eval,variant_eval,true,validationPrior);
} else {
if ( goodSite(variant_eval) ) {
writeBeagleOutput(variant_eval,validation_eval,false,variantPrior);
return 1;
} else { // todo -- if the variant site is bad, validation is good, but not in bootstrap set -- what do?
return 0;
}
}
return 1;
} else {
return 0;
if (!vc_eval.isSNP())
return 0;
if (!vc_eval.isBiallelic())
return 0;
// output marker ID to Beagle input file
beagleWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString()));
for (Allele allele: vc_eval.getAlleles()) {
// TODO -- check whether this is really needed by Beagle
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
bglPrintString = "0";
else
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString));
}
if ( !vc_eval.hasGenotypes() )
return 0;
Map<String, Genotype> genotypes = vc_eval.getGenotypes();
for ( String sample : samples ) {
// use sample as key into genotypes structure
Genotype genotype = genotypes.get(sample);
if (genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
double[] likeArray = new double[glArray.length];
// convert to double array so we can normalize
int k=0;
for (String gl : glArray) {
likeArray[k++] = Double.valueOf(gl);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: normalizedLikelihoods)
beagleWriter.print(String.format("%5.4f ",likeVal));
}
else {
// we are masking out this genotype
beagleWriter.print("0.33 0.33 0.33 ");
}
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else if (genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
// is confident. This is useful for Hapmap and 1KG release VCFs.
double AA = 0.02;
double AB = 0.02;
double BB = 0.02;
if (genotype.isHomRef()) { AA = 0.96; }
else if (genotype.isHet()) { AB = 0.96; }
else if (genotype.isHomVar()) { BB = 0.96; }
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else {
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
}
}
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println();
} else {
return 0;
}
return 1;
}
public boolean goodSite(VariantContext a, VariantContext b) {
return goodSite(a) || goodSite(b);
}
public boolean goodSite(VariantContext v) {
return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes();
}
public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) {
if( goodSite(validation) ) {
// if using record keeps us below expected proportion, use it
logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1));
if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) {
if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() );
}
bootstrapSetSize ++;
return true;
} else {
if ( bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(validation,ref.getBase());
}
testSetSize++;
return false;
}
} else {
if ( validation != null && bootstrapVCFOutput != null ) {
bootstrapVCFOutput.add(validation,ref.getBase());
}
return false;
}
}
public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) {
beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
if ( beagleGenotypesWriter != null ) {
beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString()));
}
for ( Allele allele : preferredVC.getAlleles() ) {
String bglPrintString;
if (allele.isNoCall() || allele.isNull())
bglPrintString = "0";
else
bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele
beagleWriter.print(String.format("%s ", bglPrintString));
}
Map<String,Genotype> preferredGenotypes = preferredVC.getGenotypes();
Map<String,Genotype> otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null;
boolean isValidation;
for ( String sample : samples ) {
Genotype genotype;
// use sample as key into genotypes structure
if ( preferredGenotypes.keySet().contains(sample) ) {
genotype = preferredGenotypes.get(sample);
isValidation = isValidationSite;
} else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) {
genotype = otherGenotypes.get(sample);
isValidation = ! isValidationSite;
} else {
// there is magically no genotype for this sample.
throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen.");
}
/**
* Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key
*/
if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(",");
double[] likeArray = new double[glArray.length];
// convert to double array so we can normalize
int k=0;
for (String gl : glArray) {
likeArray[k++] = Double.valueOf(gl);
}
double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray);
// see if we need to randomly mask out genotype in this position.
Double d = generator.nextDouble();
if (d > insertedNoCallRate ) {
// System.out.format("%5.4f ", d);
for (Double likeVal: normalizedLikelihoods)
beagleWriter.print(String.format("%5.4f ",likeVal));
}
else {
// we are masking out this genotype
beagleWriter.print("0.33 0.33 0.33 ");
}
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
/**
* otherwise, use the prior uniformly
*/
else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) {
// hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype
// is confident. This is useful for Hapmap and 1KG release VCFs.
double AA = (1.0-prior)/2.0;
double AB = (1.0-prior)/2.0;
double BB = (1.0-prior)/2.0;
if (genotype.isHomRef()) { AA = prior; }
else if (genotype.isHet()) { AB = prior; }
else if (genotype.isHomVar()) { BB = prior; }
beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB);
if (beagleGenotypesWriter != null) {
char a = genotype.getAllele(0).toString().charAt(0);
char b = genotype.getAllele(0).toString().charAt(0);
beagleGenotypesWriter.format("%c %c ", a, b);
}
}
else {
beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes.
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.print(". . ");
}
}
beagleWriter.println();
if (beagleGenotypesWriter != null)
beagleGenotypesWriter.println();
}
public Integer reduceInit() {
return 0; // Nothing to do here
}
@ -195,6 +276,19 @@ public class ProduceBeagleInputWalker extends RodWalker<Integer, Integer> {
public void onTraversalDone( Integer sum ) {
}
}
private void initializeVcfWriter() {
final ArrayList<String> inputNames = new ArrayList<String>();
inputNames.add( VALIDATION_ROD_NAME );
// setup the header fields
Set<VCFHeaderLine> hInfo = new HashSet<VCFHeaderLine>();
hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames));
hInfo.add(new VCFFilterHeaderLine("bootstrap","This site used for genotype bootstrapping with ProduceBeagleInputWalker"));
bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames)));
}
}

View File

@ -54,4 +54,27 @@ public class BeagleIntegrationTest extends WalkerTest {
executeTest("test BeagleInput", spec);
}
@Test
public void testBeagleInput2() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+
"-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+
"-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2,
Arrays.asList("44d28b6b092d5f4c0ae59af442612ea3","481c58f8309916184a33ab1835e5cc48"));
executeTest("test BeagleInputWithBootstrap",spec);
}
@Test
public void testBeagleOutput2() {
WalkerTestSpec spec = new WalkerTestSpec(
"-T BeagleOutputToVCF -R "+hg19Reference+" "+
"-B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+
"-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+
"-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+
"-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+
"-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("fd104b7da9e6c99b662c6ad746fd53f3"));
executeTest("testBeagleChangesSitesToRef",spec);
}
}

View File

@ -0,0 +1,122 @@
import com.sun.tools.doclets.internal.toolkit.util.DocFinder.Input
import org.broadinstitute.sting.gatk.{CommandLineGATK, DownsampleType}
import java.io.File
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.commandline.Argument
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.pipeline.{BamProcessing,VariantCalling}
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class fullCallingPipelineV2 extends QScript {
qscript =>
@Argument(doc="Number of cleaning jobs", shortName="cleaningJobs")
var cleaningJobs: Int = _
@Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y")
var yamlFile: File = _
@Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false)
var trigger: File = _
@Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable")
var refseqTable: File = _
@Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false)
var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar")
@Input(doc="path to GATK jar")
var gatkJar: File = _
@Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true)
var target_titv: Float = _
@Input(doc="per-sample downsampling level",shortName="dcov",required=false)
var downsampling_coverage = 300
@Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false)
var num_snp_scatter_jobs = 20
@Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false)
var num_indel_scatter_jobs = 5
@Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false)
var skip_cleaning = false
//@Input(doc="ADPR script")
//var adprScript: File = _
//@Input(doc="Sequencing maching name (for use by adpr)")
//var machine: String = _
//@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection")
//var protocol: String = _
private var pipeline: Pipeline = _
trait CommandLineGATKArgs extends CommandLineGATK {
this.intervals = qscript.pipeline.getProject.getIntervalList
this.jarFile = qscript.gatkJar
this.reference_sequence = qscript.pipeline.getProject.getReferenceFile
this.memoryLimit = Some(4)
}
// ------------ SETUP THE PIPELINE ----------- //
def script = {
pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile)
var callingLib: VariantCalling = new VariantCalling(qscript.yamlFile,qscript.gatkJar)
var cleaningLib: BamProcessing = new BamProcessing(qscript.yamlFile,qscript.gatkJar,qscript.picardFixMatesJar)
val projectBase: String = qscript.pipeline.getProject.getName
val cleanedBase: String = projectBase + ".cleaned"
val uncleanedBase: String = projectBase + ".uncleaned"
// there are commands that use all the bam files
val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated"))
var bamsToClean: List[(File,File)] = Nil
var recalBams: List[File] = Nil
var cleanedBams: List[File] = Nil
for ( sample <- recalibratedSamples ) {
val bam = sample.getBamFiles.get("recalibrated")
recalBams :+= bam
if (!sample.getBamFiles.contains("cleaned")) {
sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam"))
bamsToClean :+= (bam,sample.getBamFiles.get("cleaned"))
}
cleanedBams :+= sample.getBamFiles.get("cleaned")
}
if ( !qscript.skip_cleaning ) {
cleaningLib.StandardIndelRealign(bamsToClean,qscript.cleaningJobs)
}
if (!qscript.skip_cleaning) {
endToEnd(cleanedBase, cleanedBams, lib)
} else {
endToEnd(uncleanedBase, recalBams, lib)
}
}
def endToEnd(base: String, bamFiles: List[File], lib: VariantCalling) = {
var recal_vcf = base+"_snps.recal.annotated.tranched.vcf"
var handfilt_vcf = base+"_snps.handfiltered.annotated.vcf"
var indel_vcf = base+"_indel_calls.vcf"
for ( c <- lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,qscript.target_titv,qscript.refseqTable) ) {
add(c)
}
}
}

View File

@ -0,0 +1,124 @@
import java.io.{FileReader, File, BufferedReader}
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class omni_bootstrap_refine extends QScript {
=> script
val ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val gatkJar = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar")
trait FooTrait extends CommandLineGATK {
this.reference_sequence = script.ref
this.jarFile = script.gatkJar
this.intervalsString :+= "20"
}
class SampleOverlap extends CommandLineFunction {
@Input(doc="omni vcf") var omni: File = _
@Input(doc="other vcf") var other: File = _
@Output(doc="output sn file") var out : File = _
def commandLine = "%s ; %s ; %s ; %s".format(this.cmd1, this.cmd2, this.cmd3, this.cmd4)
def cmd1 : String = {
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n' | sort > temp1".format(omni.getAbsolutePath)
}
def cmd2 : String = {
return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n'| sort > temp2".format(other.getAbsolutePath)
}
def cmd3 : String = {
return "cat temp1 temp2 | sort | uniq -c | awk '{if($1 == 2) print $2}' > %s".format(out.getAbsolutePath)
}
def cmd4 : String = {
return "rm temp1 temp2"
}
}
class BeagleRefinement extends CommandLineFunction {
@Input(doc="The beagle input file") var beagleInput: File = _
var beagleOutputBase: String = _
var beagleMemoryGigs: Int = 4
/**
* Note: These get set
*/
@Output(doc="The beagle phased file") var beaglePhasedFile: File = _
@Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _
@Output(doc="The beagle r2 file") var beagleRSquared: File = _
var beagleOutputDir: String = _
def freezeOutputs = {
if ( beagleInput.getParent == null ) {
beagleOutputDir = ""
} else {
beagleOutputDir = beagleInput.getParent
}
beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz")
beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz")
beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2")
}
def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,beagleOutputBase)
}
def runme(pop: File, omni: File, base: String) : List[CommandLineFunction] = {
var clf : List[CommandLineFunction] = Nil
val s_overlap = new File(base+"_sample_overlap.txt")
var calcOverlap = new SampleOverlap
calcOverlap.omni = omni
calcOverlap.other = pop
calcOverlap.out = s_overlap
clf += calcOverlap
val subset_omni = swapExt(omni,".vcf","_%s_subset.vcf".format(base))
val subset_pop = swapExt(pop,".vcf","_%s_subset.vcf".format(base))
var omSubset = new SelectVariants with FooTrait
omSubset.variantVCF = omni
omSubset.sample = s_overlap.getAbsolutePath
omSubset.out = subset_omni
clf += omSubset
var popSubset = new SelectVariants with FooTrait
popSubset.variantVCF = pop
popSubset.sample = s_overlap.getAbsolutePath
popSubset.out = subset_pop
clf += popSubset
var bootRefine = new ProduceBeagleInput with FooTrait
bootRefine.variantVCF = popSubset
bootRefine.rodBind :+= new RodBind("validation","VCF",subset_omni)
bootRefine.bootstrap_vcf = swapExt(subset_omni,".vcf",".boot.vcf")
bootRefine.bootstrap = Some(2)
bootRefine.validation_genotype_ptrue = Some(0.98);
bootRefine.out = new File(base+".beagle_input.beagle")
clf += bootRefine
var runBeagle = new BeagleRefinement
runBeagle.beagleInput = bootRefine.out
runBeagle.beagleOutputBase = base+"_beagle"
runBeagle.freezeOutputs()
clf += runBeagle
var putBack = new BeagleOutputToVCF with FooTrait
putBack.
}
}

View File

@ -14,22 +14,28 @@ import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
class omni_qc extends QScript {
qscript =>
var pilot3_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot3_overlap.txt")
var pilot1_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot1_overlap.txt")
var production_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/production_overlap.txt")
var hiseq_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878_name.txt")
// NON-OMNI VCF FILES
var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf")
var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf")
var august_calls_vcf = null
var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf")
// OMNI VCF FILES
var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf")
var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf")
var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf")
// INTERVALS
var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list"
var pilot1_interval_list: String = _
var hiseq_interval_list: String = _
var production_interval_list: String = "20"
val Omni_chip_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.vcf.gz","vcf")
val Omni_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.b37.vcf.gz","vcf")
val pilot3_vcf_broad = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.exon.2010_05.broad.genotypes.vcf.gz","vcf")
val pilot1_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf")
val production_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.production.2010_08.chr20.lowpass.genotypes.vcf","vcf")
val hiSeq_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf")
// OTHER
var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt")
trait OmniArgs extends CommandLineGATK {
this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta")
@ -80,6 +86,10 @@ class omni_qc extends QScript {
eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval")
eval.reportType = Some(VE2TemplateType.R)
eval.analysisName = base+"_Eval"
eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5")
eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+")
eval.disI = true
eval.outputVCF = outDir+base+".eval.interesting.sites.vcf"
if ( base.equalsIgnoreCase("production") ) {
subset.variantVCF = Omni_b37_vcf
@ -88,6 +98,8 @@ class omni_qc extends QScript {
eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod")
}
add(subset,makeOmniSiteList,eval)
}
@ -96,5 +108,6 @@ class omni_qc extends QScript {
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3")
runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq")
runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production")
runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release")
}
}
}

View File

@ -38,4 +38,5 @@ trait QScript extends Logging {
* Adds one or more command line functions to be run.
*/
def add(functions: CommandLineFunction*) = this.functions ++= List(functions:_*)
}

View File

@ -12,6 +12,9 @@ import org.broadinstitute.sting.queue.function.scattergather.{SimpleTextGatherFu
trait CommandLineFunction extends QFunction with Logging {
def commandLine: String
/* Is it a gather function? */
var isGather: Boolean = true
/** Upper memory limit */
var memoryLimit: Option[Int] = None

View File

@ -0,0 +1,138 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.extensions.gatk._
import java.io.File
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.function.CommandLineFunction
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import net.sf.picard.reference.ReferenceSequenceFileFactory
import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser}
import org.broadinstitute.sting.utils.interval.IntervalUtils
import collection.mutable.{ListBuffer, HashMap}
import collection.JavaConversions
import java.util.Arrays
import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils}
import org.broadinstitute.sting.commandline.{Output, Input}
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
/**
 * Library of standard bam-processing command line functions (indel realignment,
 * per-contig cleaning, mate fixing), configured from a pipeline yaml file.
 *
 * NOTE(review): swapExt below strips the directory component (File.getName), so
 * every derived output file is created relative to the run directory -- confirm
 * this matches the Queue convention the callers expect.
 */
class BamProcessing(yaml: File, gatkJar: File, fixMatesJar: File) {
  library =>

  // Project-wide attributes (reference, interval list, dbSNP) loaded from the yaml.
  var attributes: Pipeline = YamlUtils.load(classOf[Pipeline],yaml)

  // Defaults shared by every GATK job this library creates.
  trait StandardCommandLineGATK extends CommandLineGATK {
    this.reference_sequence = library.attributes.getProject.getReferenceFile
    this.intervals = library.attributes.getProject.getIntervalList
    this.DBSNP = library.attributes.getProject.getDbsnpFile
    // low global default; individual jobs raise this where they need more
    this.memoryLimit = Some(2)
    this.jarFile = library.gatkJar
  }

  /**
   * @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run
   * @Returns: A CLF for the realigner target creator job
   */
  protected def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = {
    var rtc = new RealignerTargetCreator with StandardCommandLineGATK
    // contigs are supplied as interval strings; the yaml interval list is cleared so the two cannot conflict
    rtc.intervals = null
    rtc.intervalsString = contigs
    rtc.input_file :+= bam
    rtc.out = output
    rtc.analysisName = "RealignerTargetCreator"
    return rtc
  }

  /**
   * @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file
   * @Returns: A CLF for the indel cleaning job
   */
  protected def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = {
    var realigner = new IndelRealigner with StandardCommandLineGATK
    realigner.intervalsString = contigs
    realigner.intervals = null
    realigner.input_file :+= bam
    realigner.out = outBam
    realigner.targetIntervals = targets
    realigner.analysisName = "IndelClean"
    // uncompressed intermediate output; the downstream merge step re-compresses
    realigner.bam_compression = Some(0)
    return realigner
  }

  /**
   * @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to
   * @Returns: A list of CLFs (todo -- wrapped in a Pipeline)
   */
  protected def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = {
    var cmds : List[CommandLineFunction] = Nil
    // one (input bam, per-contig-group output bam, contig group) spec per cleaning job
    var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]](
      ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) }
    )
    var bamsToMerge : List[File] = Nil
    for ( spec <- jobSpecs ) {
      cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2)
      bamsToMerge :+= spec._2
    }
    // merge the per-contig cleaned bams back into the single cleaned output
    cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar)
    return cmds
  }

  /**
   * @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of
   * command line functions to do the target-creating, splitting, cleaning, and merging, returning that list
   * of command line functions
   * @Returns: A list of command line functions for the full indel realignment pipeline from the collection
   * of uncleaned bams to the collection of cleaned bams
   */
  protected def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = {
    // partition the interval list's contigs into nJobs roughly-equal groups
    val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs)
    var commands : List[CommandLineFunction] = Nil
    for ( bamPair <- bamsUncleanCleanPairs ) {
      // the target creator runs once over all contigs (the groups flattened back together)
      val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") )
      val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2)
      // index the final cleaned bam
      val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction
      sam.bamFile = bamPair._2
      sam.analysisName = "SamtoolsIndex"
      commands :+= rtc
      commands ++= icbs
      commands :+= sam
    }
    return commands
  }

  /**
   * @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted
   * @Returns: Command line function for the merge, fix-mate, and sort operation
   */
  protected def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = {
    var pfm : PicardFixMates = new PicardFixMates
    pfm.bams = inBams
    pfm.outBam = outBam
    pfm.jarFile = picardJar
    pfm.assumeSorted = Some(false)
    pfm.memoryLimit = Some(4)
    pfm.analysisName = "FixMates"
    return pfm
  }

  // Thin Picard wrapper exposing the fix-mates jar's input/output contract to Queue.
  class PicardFixMates extends PicardBamJarFunction {
    @Input(doc="input bam files") var bams: List[File] = Nil
    @Output(doc="output bam file") var outBam: File = null
    def inputBams: List[File] = bams
    def outputBam: File = outBam
  }

  // Replaces oldExtension with newExtension; strips any directory component
  // (File.getName), so the result is relative to the working directory.
  def swapExt(file: File, oldExtension: String, newExtension: String) =
    new File(file.getName.stripSuffix(oldExtension) + newExtension)
}

View File

@ -0,0 +1,13 @@
package org.broadinstitute.sting.queue.pipeline
import org.broadinstitute.sting.queue.function.CommandLineFunction
/**
 * Base contract for a pipeline: an accumulated list of command line functions
 * plus lifecycle hooks for generating and pruning them.
 */
trait QPipeline {
  // The command line functions that make up this pipeline.
  var commands: List[CommandLineFunction] = Nil
  // Populates (or re-populates) the commands list.
  def generateCommands
  def track // todo -- implement me
  def removeIntermediate // todo -- implement me
}

View File

@ -0,0 +1,340 @@
package org.broadinstitute.sting.queue.pipeline
import java.io.File
import net.sf.picard.reference.FastaSequenceFile
import org.broadinstitute.sting.datasources.pipeline.Pipeline
import org.broadinstitute.sting.gatk.DownsampleType
import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.samtools._
import org.broadinstitute.sting.queue.{QException, QScript}
import collection.JavaConversions._
import org.broadinstitute.sting.utils.yaml.YamlUtils
import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType
/**
 * Library of standard variant-calling command line functions (UnifiedGenotyper,
 * indel calling/merging, filtering, variant quality recalibration, annotation),
 * configured from a pipeline yaml file.
 */
class VariantCalling(yaml: File,gatkJar: File) {
  vc =>

  // Project-wide attributes (reference, interval list, dbSNP, project) loaded from the yaml.
  var attributes: Pipeline = YamlUtils.load(classOf[Pipeline], yaml)

  /**
   * Trait to propagate basic attributes throughout the library
   */
  trait StandardCommandLineGATK extends CommandLineGATK {
    this.reference_sequence = vc.attributes.getProject.getReferenceFile
    this.intervals = vc.attributes.getProject.getIntervalList
    this.DBSNP = vc.attributes.getProject.getDbsnpFile
    // set global memory limit on the low side. Additional input bams will affect it.
    this.memoryLimit = Some(2)
    this.jarFile = vc.gatkJar
  }

  /**
   * @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file
   * @Return: UnifiedGenotyper with the standard GSA arguments
   * @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information
   */
  protected def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = {
    var ug = new UnifiedGenotyper with StandardCommandLineGATK
    ug.analysisName = "UnifiedGenotyper"
    ug.input_file = bams
    ug.group :+= "Standard"
    ug.out = output
    ug.min_base_quality_score = Some(10)
    ug.min_mapping_quality_score = Some(10)
    ug.cap_base_quality_by_mapping_quality = true
    ug.standard_min_confidence_threshold_for_emitting = Some(10)
    ug.standard_min_confidence_threshold_for_calling = Some(30)
    ug.trigger_min_confidence_threshold_for_calling = Some(0)
    ug.downsample_to_coverage = Some(300)
    ug.dt = Some(DownsampleType.BY_SAMPLE)
    ug.scatterCount = 50
    // many-sample runs need headroom beyond the 2g library default
    if ( bams.size > 125 ) {
      ug.memoryLimit = Some(4)
    }
    return ug
  }

  /**
   * @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file
   * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments
   */
  protected def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = {
    var ig = new IndelGenotyperV2 with StandardCommandLineGATK
    ig.analysisName = "IndelGenotyper"
    ig.input_file :+= bam
    ig.out = output
    ig.downsample_to_coverage = Some(300)
    return ig
  }

  /**
   * @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline
   * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job
   */
  private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = {
    var ig = StandardIndelGenotyper(bam,output)
    ig.isGather = gather
    return ig
  }

  /**
   * @Doc: Combines a list of indel VCFs to a single output file
   * @Returns: A CombineVariants CLF with standard GSA arguments
   */
  protected def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = {
    var cv = new CombineVariants with StandardCommandLineGATK
    cv.out = output
    cv.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY)
    cv.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION)
    // NOTE(review): shares the "IndelGenotyper" analysis name with the per-bam
    // genotypers above -- confirm this grouping is intentional
    cv.analysisName = "IndelGenotyper"
    cv.isGather = true
    // NOTE(review): priority uses getAbsolutePath while the rod bindings are
    // registered under getName -- confirm CombineVariants matches them up
    cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",")
    cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out))
    return cv
  }

  /**
   * @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline.
   * @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names.
   */
  protected def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = {
    var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false)))
    var combine = this.StandardIndelCombine( genotypers, output )
    var callFunctions: List[CommandLineGATK] = genotypers
    callFunctions = combine :: callFunctions
    return callFunctions
  }

  /**
   * @Doc: Builds a VariantFiltration job that masks snp calls near indels and
   * flags snp clusters
   * @Returns: A VariantFiltration CLF with the standard indel mask and cluster settings
   */
  protected def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = {
    var iFil = new VariantFiltration with StandardCommandLineGATK
    iFil.analysisName = "FilterAtIndels"
    iFil.out = output
    iFil.mask = indels.getAbsolutePath
    iFil.maskName = "Indel_Mask"
    iFil.variantVCF = snps
    // todo -- cluster size varies with # bams
    iFil.clusterSize = Some(5)
    iFil.clusterWindowSize = Some(8)
    return iFil
  }

  /**
   * @Doc: Builds the standard GSA hand-filter (quality-by-depth, allele balance,
   * strand bias expressions) over a snp VCF
   * @Returns: A VariantFiltration CLF applying the three hand-filter expressions
   */
  protected def StandardHandfilter( snps: File, output: File ) : VariantFiltration = {
    var hFil = new VariantFiltration with StandardCommandLineGATK
    hFil.analysisName = "HandFilter"
    hFil.out = output
    hFil.variantVCF = snps
    hFil.filterExpression :+= "QD<5"
    hFil.filterName :+= "LowQualByDepth"
    hFil.filterExpression :+= "AB>0.75"
    hFil.filterName :+= "HighAlleleBalance"
    hFil.filterExpression :+= "SB>-0.10"
    hFil.filterName :+= "HighStrandBias"
    return hFil
  }

  /**
   * @Doc: Builds the variant-cluster generation step of quality recalibration
   * using the standard annotation set
   * @Returns: A GenerateVariantClusters CLF writing its clusters to output
   */
  protected def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = {
    var genC = new GenerateVariantClusters with StandardCommandLineGATK
    genC.analysisName = "VariantQualityRecalibration"
    genC.rodBind :+= new RodBind("input","VCF",snps)
    genC.cluster_file = output
    genC.use_annotation :+= "QD"
    genC.use_annotation :+= "SB"
    genC.use_annotation :+= "HaplotypeScore"
    genC.use_annotation :+= "AB"
    genC.use_annotation :+= "HRun"
    return genC
  }

  /**
   * @Doc: Builds the recalibration step of variant quality recalibration with
   * the standard tranche cut points
   * @Returns: A VariantRecalibrator CLF writing the recalibrated VCF, tranches
   * file, and report data file
   */
  protected def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File,
                                              out_tranches: File, out_dat: File) : VariantRecalibrator = {
    var vr = new VariantRecalibrator with StandardCommandLineGATK
    vr.analysisName = "VariantQualityRecalibration"
    vr.rodBind :+= new RodBind("input","VCF",raw_vcf)
    vr.cluster_file = cluster
    vr.target_titv = target_titv
    vr.out = out_vcf
    vr.tranches_file = out_tranches
    vr.report_dat_file = out_dat
    vr.tranche :+= "0.1"
    vr.tranche :+= "1"
    vr.tranche :+= "5"
    vr.tranche :+= "10"
    return vr
  }

  /**
   * @Doc: Builds the tranche-cutting step of variant quality recalibration at a
   * 5% FDR filter level
   * @Returns: An ApplyVariantCuts CLF writing the final filtered VCF to output
   */
  protected def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = {
    var avc = new ApplyVariantCuts with StandardCommandLineGATK
    avc.analysisName = "VariantQualityRecalibration"
    avc.rodBind :+= new RodBind("input","VCF",snpRecal)
    avc.out = output
    avc.tranches_file = tranches
    avc.fdr_filter_level = Some(5)
    return avc
  }

  /**
   * @Doc: Runs the full variant-quality-recalibration chain (cluster -> recalibrate -> cut)
   * over a raw snp VCF, leaving the final tranche-filtered calls in recalVcf
   * @Returns: The three CLFs, in dependency order
   */
  protected def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = {
    var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster"))
    // NOTE(review): the cluster output is assigned via cluster_file above but read
    // back here as clusterFile -- confirm GenerateVariantClusters exposes both names
    var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"),
        swapExt(snps,".vcf",".recal.tranch"),swapExt(snps,".vcf",".recal.dat"))
    // FIX: the cut step previously wrote to recal.out -- its own input -- and the
    // recalVcf parameter was never used; the final cut VCF now lands at the
    // caller-requested path.
    var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,recalVcf)
    var cmds: List[CommandLineGATK] = Nil
    cmds :+= clust
    cmds :+= recal
    cmds :+= cut
    return cmds
  }

  /**
   * @Doc: Builds a GenomicAnnotator job that annotates a snp VCF against a refseq table
   * @Returns: A GenomicAnnotator CLF writing the annotated VCF to outputVCF
   */
  protected def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = {
    var ga = new GenomicAnnotator with StandardCommandLineGATK
    ga.analysisName = "GenomicAnnotator"
    ga.variantVCF = snps
    ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile)
    ga.rodToIntervalTrackName = "variant"
    ga.out = outputVCF
    return ga
  }

  /**
   * @Doc: Full snp-calling pipeline: raw snps, indel calls, indel-mask filtering,
   * optional refGene annotation, and quality recalibration into output
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    // intermediate files are written next to the requested output
    var dir = ""
    if ( output.getParent != null ) {
      dir = output.getParent+"/"
    }
    // NOTE(review): getProject is concatenated directly into the file name
    // (implicit toString) -- presumably a project *name* accessor was intended; confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output)
    commands ++= recal
    return commands
  }

  /**
   * @Doc: Same chain as StandardSNPCalls, but additionally produces a
   * hand-filtered VCF alongside the recalibrated one
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    var dir = ""
    if ( recalOut.getParent != null ) {
      dir = recalOut.getParent+"/"
    }
    // NOTE(review): same implicit getProject toString as StandardSNPCalls -- confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf")
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
    commands ++= recal
    // hand filter runs off the pre-annotation snps, in parallel with recalibration
    var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
    commands :+= handFilt
    return commands
  }

  /**
   * @Doc: Same chain as StandardSNPCallsBothFilterTypes, but the merged indel
   * calls are written to a caller-chosen indelOut instead of a derived name
   * @Returns: The CLFs for the whole chain, in dependency order
   */
  protected def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = {
    var commands : List[CommandLineGATK] = Nil
    var dir = ""
    if ( recalOut.getParent != null ) {
      dir = recalOut.getParent+"/"
    }
    // NOTE(review): same implicit getProject toString as StandardSNPCalls -- confirm
    var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf")
    var ug = StandardUnifiedGenotyper(bams, raw_snp)
    commands :+= ug
    var raw_indel = indelOut
    var ig = StandardIndelCalls(bams,raw_indel)
    commands ++= ig
    var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf")
    var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp)
    commands :+= iFilt
    var annoSNP = prefilt_snp
    if ( refGene != null ) {
      annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf")
      var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP)
      commands :+= annotate
    }
    var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut)
    commands ++= recal
    var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut)
    commands :+= handFilt
    return commands
  }

  // Replaces oldExtension with newExtension; strips any directory component
  // (File.getName), so the result is relative to the working directory.
  def swapExt(file: File, oldExtension: String, newExtension: String) =
    new File(file.getName.stripSuffix(oldExtension) + newExtension)
}

View File

@ -0,0 +1,55 @@
package org.broadinstitute.sting.queue.util
import net.sf.picard.reference.ReferenceSequenceFileFactory
import java.io.File
import org.broadinstitute.sting.utils.GenomeLocParser
import collection.JavaConversions._
import org.broadinstitute.sting.utils.interval.IntervalUtils
/** Placeholder type; all functionality lives on the companion object. */
class PipelineUtils {
}

/**
 * Static helpers shared by the pipeline libraries.
 */
object PipelineUtils{
  /**
   * Splits the contigs named in an interval list into (at most) `sets` groups
   * of roughly equal total target size, greedily assigning each contig to the
   * currently-lightest group.
   */
  def smartSplitContigs(reference: File, intervals: File, sets: Int) : List[List[String]] = {
    GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(reference))
    val targets = IntervalUtils.parseIntervalArguments(List(intervals.getAbsolutePath), false)

    // Total up the interval sizes per contig.
    var sizeByContig = Map.empty[String, Long]
    targets.foreach( loc =>
      sizeByContig += loc.getContig -> (sizeByContig.getOrElse(loc.getContig, 0L) + loc.size)
    )

    // Greedy bin packing: seed one bin per contig until `sets` bins exist, then
    // always drop the next contig into the lightest bin.
    var bins = List.empty[(Long, List[String])]
    for ((contigName, contigSize) <- sizeByContig) {
      if (bins.size < sets) {
        bins :+= contigSize -> List(contigName)
      } else {
        val ordered = bins.sortBy{case (size, _) => size}
        val lightest = ordered.head
        bins = ((lightest._1 + contigSize) -> (lightest._2 :+ contigName)) :: ordered.tail
      }
    }
    bins.map{case (_, contigs) => contigs}
  }
}