From 21ec44339d51cd3afa8f3f8b6c1b70e2bc14bd51 Mon Sep 17 00:00:00 2001 From: chartl Date: Fri, 8 Oct 2010 13:30:28 +0000 Subject: [PATCH] Somewhat major update. Changes: - ProduceBeagleInputWalker + Now takes a validation ROD and a prior to give it, will use those genotypes in place of the variant genotypes if both are present + Takes a bootstrap argument -- can use some given %age of the validation sites + Optionally takes a bootstrap output argument -- re-prints the validation VCF, filtering those sites used as part of the bootstrap -BeagleOutputToVCFWalker + Now filters sites where the genotypes have been reverted to hom ref + Now calls in to the new VCUtils to calculate AC/AN -Queue + New pipeline libraries for easy qscript creation, still a work in progress, but this is a considerable prototype + full calling pipeline v2 uses the above libraries + minor changes to some of my own scripts + no more need for contig interval lists, these will be parsed out of your normal interval list when it is provided git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@4459 348d0f76-0448-11de-a6fe-93d51630548a --- .../beagle/BeagleOutputToVCFWalker.java | 34 +- .../beagle/ProduceBeagleInputWalker.java | 296 +++++++++------ .../walkers/beagle/BeagleIntegrationTest.java | 23 ++ scala/qscript/chartl/fullCallingPipelineV2.q | 122 +++++++ scala/qscript/chartl/omni_bootstrap_refine.q | 124 +++++++ scala/qscript/chartl/omni_qc.q | 35 +- .../broadinstitute/sting/queue/QScript.scala | 1 + .../queue/function/CommandLineFunction.scala | 3 + .../sting/queue/pipeline/BamProcessing.scala | 138 +++++++ .../sting/queue/pipeline/QPipeline.scala | 13 + .../sting/queue/pipeline/VariantCalling.scala | 340 ++++++++++++++++++ .../sting/queue/util/PipelineUtils.scala | 55 +++ 12 files changed, 1053 insertions(+), 131 deletions(-) create mode 100755 scala/qscript/chartl/fullCallingPipelineV2.q create mode 100755 scala/qscript/chartl/omni_bootstrap_refine.q create mode 100755 scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala create mode 100755 scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala create mode 100755 scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala create mode 100755 scala/src/org/broadinstitute/sting/queue/util/PipelineUtils.scala diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java index 909f9cd34..3979a16c2 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/BeagleOutputToVCFWalker.java @@ -32,6 +32,7 @@ import org.broadinstitute.sting.commandline.Argument; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; +import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.features.beagle.BeagleFeature; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; @@ -161,6 +162,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { int numGenotypesChangedByBeagle = 0; Integer alleleCountH = 0, chrCountH = 0; Double alleleFrequencyH = 0.0; + int beagleVarCounts = 0; Map hapmapGenotypes = null; @@ -284,31 +286,26 @@ public class BeagleOutputToVCFWalker extends RodWalker { originalAttributes.put("OG","."); } Genotype imputedGenotype = new Genotype(originalGenotypes.getKey(), alleles, genotypeQuality, filters,originalAttributes , genotypeIsPhased); - + if ( imputedGenotype.isHet() || imputedGenotype.isHomVar() ) { + beagleVarCounts++; + } genotypes.put(originalGenotypes.getKey(), imputedGenotype); } - VariantContext filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); - - Set altAlleles = filteredVC.getAlternateAlleles(); - StringBuffer altAlleleCountString = new StringBuffer(); - for ( Allele allele : altAlleles ) { - if ( altAlleleCountString.length() > 0 ) - altAlleleCountString.append(","); - altAlleleCountString.append(filteredVC.getChromosomeCount(allele)); + VariantContext filteredVC; + if ( beagleVarCounts > 0) + filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), vc_input.getAlleles(), genotypes, vc_input.getNegLog10PError(), vc_input.filtersWereApplied() ? vc_input.getFilters() : null, vc_input.getAttributes()); + else { + Set removedFilters = vc_input.filtersWereApplied() ? new HashSet(vc_input.getFilters()) : new HashSet(1); + removedFilters.add(String.format("BGL_RM_WAS_%s/%s",vc_input.getReference().getBaseString(),vc_input.getAlternateAllele(0))); + filteredVC = new VariantContext("outputvcf", vc_input.getChr(), vc_input.getStart(), vc_input.getEnd(), new HashSet(Arrays.asList(vc_input.getReference())), genotypes, vc_input.getNegLog10PError(), removedFilters, vc_input.getAttributes()); } HashMap attributes = new HashMap(filteredVC.getAttributes()); - if ( filteredVC.getChromosomeCount() > 0 ) { - attributes.put(VCFConstants.ALLELE_NUMBER_KEY, String.format("%d", filteredVC.getChromosomeCount())); - if ( altAlleleCountString.length() > 0 ) { - attributes.put(VCFConstants.ALLELE_COUNT_KEY, altAlleleCountString.toString()); - attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, String.format("%4.2f", - Double.valueOf(altAlleleCountString.toString())/(filteredVC.getChromosomeCount()))); - } - } + // re-compute chromosome counts + VariantContextUtils.calculateChromosomeCounts(filteredVC, attributes, false); // Get Hapmap AC and AF if (vc_comp != null) { @@ -322,8 +319,7 @@ public class BeagleOutputToVCFWalker extends RodWalker { attributes.put("R2", beagleR2Feature.getR2value().toString() ); - - vcfWriter.add(VariantContext.modifyAttributes(filteredVC, attributes), ref.getBase()); + vcfWriter.add(VariantContext.modifyAttributes(filteredVC,attributes), ref.getBase()); return 1; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java index 097421734..e8133ae34 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/beagle/ProduceBeagleInputWalker.java @@ -28,13 +28,14 @@ package org.broadinstitute.sting.gatk.walkers.beagle; import org.broad.tribble.util.variantcontext.Allele; import org.broad.tribble.util.variantcontext.Genotype; import org.broad.tribble.util.variantcontext.VariantContext; -import org.broad.tribble.vcf.VCFConstants; -import org.broad.tribble.vcf.VCFWriter; +import org.broad.tribble.vcf.*; import org.broadinstitute.sting.commandline.Argument; +import org.broadinstitute.sting.commandline.Hidden; import org.broadinstitute.sting.commandline.Output; import org.broadinstitute.sting.gatk.contexts.AlignmentContext; import org.broadinstitute.sting.gatk.contexts.ReferenceContext; import org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils; +import org.broadinstitute.sting.gatk.datasources.simpleDataSources.ReferenceOrderedDataSource; import org.broadinstitute.sting.gatk.refdata.RefMetaDataTracker; import org.broadinstitute.sting.gatk.walkers.RodWalker; import org.broadinstitute.sting.gatk.walkers.RMD; @@ -42,6 +43,8 @@ import org.broadinstitute.sting.gatk.walkers.Requires; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.MathUtils; import org.broadinstitute.sting.utils.SampleUtils; +import org.broadinstitute.sting.utils.exceptions.StingException; +import org.broadinstitute.sting.utils.vcf.VCFUtils; import java.io.PrintStream; import java.util.*; @@ -53,6 +56,7 @@ import java.util.*; public class ProduceBeagleInputWalker extends RodWalker { public static final String ROD_NAME = "variant"; + public static final String VALIDATION_ROD_NAME = "validation"; @Output(doc="File to which BEAGLE input should be written",required=true) protected PrintStream beagleWriter = null; @@ -61,9 +65,24 @@ public class ProduceBeagleInputWalker extends RodWalker { public PrintStream beagleGenotypesWriter = null; @Argument(fullName = "inserted_nocall_rate", shortName = "nc_rate", doc = "Rate (0-1) at which genotype no-calls will be randomly inserted, for testing", required = false) public double insertedNoCallRate = 0; + @Argument(fullName = "validation_genotype_ptrue", shortName = "valp", doc = "Flat probability to assign to validation genotypes. Will override GL field.", required = false) + public double validationPrior = -1.0; + @Argument(fullName = "validation_bootstrap", shortName = "bs", doc = "Proportion of records to be used in bootstrap set", required = false) + public double bootstrap = 0.0; + @Argument(fullName = "bootstrap_vcf",shortName = "bvcf", doc = "Output a VCF with the records used for bootstrapping filtered out", required = false) + VCFWriter bootstrapVCFOutput = null; + + @Hidden + @Argument(fullName = "variant_genotype_ptrue", shortName = "varp", doc = "Flat probability prior to assign to variant (not validation) genotypes. Does not override GL field.", required = false) + public double variantPrior = 0.96; + private Set samples = null; + private Set BOOTSTRAP_FILTER = new HashSet( Arrays.asList("bootstrap") ); private Random generator; + private int bootstrapCount = -1; + private int bootstrapSetSize = 0; + private int testSetSize = 0; public void initialize() { generator = new Random(); @@ -77,114 +96,176 @@ public class ProduceBeagleInputWalker extends RodWalker { beagleWriter.println(); if (beagleGenotypesWriter != null) beagleGenotypesWriter.println("dummy header"); + + if ( bootstrapVCFOutput != null ) { + initializeVcfWriter(); + } } public Integer map( RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context ) { - if( tracker != null ) { + if( tracker != null ) { GenomeLoc loc = context.getLocation(); - VariantContext vc_eval; + VariantContext variant_eval; + VariantContext validation_eval; - vc_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false); - if ( vc_eval == null || vc_eval.isFiltered() ) + variant_eval = tracker.getVariantContext(ref, ROD_NAME, null, loc, false); + validation_eval = tracker.getVariantContext(ref,VALIDATION_ROD_NAME,null,loc,false); + if ( goodSite(variant_eval,validation_eval) ) { + if ( useValidation(variant_eval,validation_eval, ref) ) { + writeBeagleOutput(validation_eval,variant_eval,true,validationPrior); + } else { + if ( goodSite(variant_eval) ) { + writeBeagleOutput(variant_eval,validation_eval,false,variantPrior); + return 1; + } else { // todo -- if the variant site is bad, validation is good, but not in bootstrap set -- what do? + return 0; + } + } + return 1; + } else { return 0; - - if (!vc_eval.isSNP()) - return 0; - - if (!vc_eval.isBiallelic()) - return 0; - - // output marker ID to Beagle input file - beagleWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString())); - - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.print(String.format("%s ", VariantContextUtils.getLocation(vc_eval).toString())); - - for (Allele allele: vc_eval.getAlleles()) { - // TODO -- check whether this is really needed by Beagle - String bglPrintString; - if (allele.isNoCall() || allele.isNull()) - bglPrintString = "0"; - else - bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele - - beagleWriter.print(String.format("%s ", bglPrintString)); } - - if ( !vc_eval.hasGenotypes() ) - return 0; - - Map genotypes = vc_eval.getGenotypes(); - for ( String sample : samples ) { - // use sample as key into genotypes structure - Genotype genotype = genotypes.get(sample); - if (genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { - String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(","); - - double[] likeArray = new double[glArray.length]; - - // convert to double array so we can normalize - int k=0; - for (String gl : glArray) { - likeArray[k++] = Double.valueOf(gl); - } - - double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray); - // see if we need to randomly mask out genotype in this position. - Double d = generator.nextDouble(); - if (d > insertedNoCallRate ) { -// System.out.format("%5.4f ", d); - for (Double likeVal: normalizedLikelihoods) - beagleWriter.print(String.format("%5.4f ",likeVal)); - } - else { - // we are masking out this genotype - beagleWriter.print("0.33 0.33 0.33 "); - } - - if (beagleGenotypesWriter != null) { - char a = genotype.getAllele(0).toString().charAt(0); - char b = genotype.getAllele(0).toString().charAt(0); - - beagleGenotypesWriter.format("%c %c ", a, b); - } - } - else if (genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { - // hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype - // is confident. This is useful for Hapmap and 1KG release VCFs. - double AA = 0.02; - double AB = 0.02; - double BB = 0.02; - - if (genotype.isHomRef()) { AA = 0.96; } - else if (genotype.isHet()) { AB = 0.96; } - else if (genotype.isHomVar()) { BB = 0.96; } - - beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB); - - if (beagleGenotypesWriter != null) { - char a = genotype.getAllele(0).toString().charAt(0); - char b = genotype.getAllele(0).toString().charAt(0); - - beagleGenotypesWriter.format("%c %c ", a, b); - } - } - else { - beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes. - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.print(". . "); - } - } - - beagleWriter.println(); - if (beagleGenotypesWriter != null) - beagleGenotypesWriter.println(); + } else { + return 0; } - return 1; - } + public boolean goodSite(VariantContext a, VariantContext b) { + return goodSite(a) || goodSite(b); + } + + public boolean goodSite(VariantContext v) { + return v != null && ! v.isFiltered() && v.isSNP() && v.isBiallelic() && v.hasGenotypes(); + } + + public boolean useValidation(VariantContext variant, VariantContext validation, ReferenceContext ref) { + if( goodSite(validation) ) { + // if using record keeps us below expected proportion, use it + logger.debug(String.format("boot: %d, test: %d, total: %d", bootstrapSetSize, testSetSize, bootstrapSetSize+testSetSize+1)); + if ( (bootstrapSetSize+1.0)/(1.0+bootstrapSetSize+testSetSize) <= bootstrap ) { + if ( bootstrapVCFOutput != null ) { + bootstrapVCFOutput.add(VariantContext.modifyFilters(validation, BOOTSTRAP_FILTER), ref.getBase() ); + } + bootstrapSetSize ++; + return true; + } else { + if ( bootstrapVCFOutput != null ) { + bootstrapVCFOutput.add(validation,ref.getBase()); + } + testSetSize++; + return false; + } + } else { + if ( validation != null && bootstrapVCFOutput != null ) { + bootstrapVCFOutput.add(validation,ref.getBase()); + } + return false; + } + } + + public void writeBeagleOutput(VariantContext preferredVC, VariantContext otherVC, boolean isValidationSite, double prior) { + beagleWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString())); + if ( beagleGenotypesWriter != null ) { + beagleGenotypesWriter.print(String.format("%s ",VariantContextUtils.getLocation(preferredVC).toString())); + } + + for ( Allele allele : preferredVC.getAlleles() ) { + String bglPrintString; + if (allele.isNoCall() || allele.isNull()) + bglPrintString = "0"; + else + bglPrintString = allele.toString().substring(0,1); // get rid of * in case of reference allele + + beagleWriter.print(String.format("%s ", bglPrintString)); + } + + Map preferredGenotypes = preferredVC.getGenotypes(); + Map otherGenotypes = goodSite(otherVC) ? otherVC.getGenotypes() : null; + boolean isValidation; + for ( String sample : samples ) { + Genotype genotype; + // use sample as key into genotypes structure + if ( preferredGenotypes.keySet().contains(sample) ) { + genotype = preferredGenotypes.get(sample); + isValidation = isValidationSite; + } else if ( otherGenotypes != null && otherGenotypes.keySet().contains(sample) ) { + genotype = otherGenotypes.get(sample); + isValidation = ! isValidationSite; + } else { + // there is magically no genotype for this sample. + throw new StingException("Sample "+sample+" arose with no genotype in variant or validation VCF. This should never happen."); + } + /** + * Use likelihoods if: is validation, prior is negative; or: is not validation, has genotype key + */ + if ( (isValidation && prior < 0.0) || genotype.isCalled() && genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { + String[] glArray = genotype.getAttributeAsString(VCFConstants.GENOTYPE_LIKELIHOODS_KEY).split(","); + + double[] likeArray = new double[glArray.length]; + + // convert to double array so we can normalize + int k=0; + for (String gl : glArray) { + likeArray[k++] = Double.valueOf(gl); + } + + double[] normalizedLikelihoods = MathUtils.normalizeFromLog10(likeArray); + // see if we need to randomly mask out genotype in this position. + Double d = generator.nextDouble(); + if (d > insertedNoCallRate ) { +// System.out.format("%5.4f ", d); + for (Double likeVal: normalizedLikelihoods) + beagleWriter.print(String.format("%5.4f ",likeVal)); + } + else { + // we are masking out this genotype + beagleWriter.print("0.33 0.33 0.33 "); + } + + if (beagleGenotypesWriter != null) { + char a = genotype.getAllele(0).toString().charAt(0); + char b = genotype.getAllele(0).toString().charAt(0); + + beagleGenotypesWriter.format("%c %c ", a, b); + } + } + /** + * otherwise, use the prior uniformly + */ + else if (! isValidation && genotype.isCalled() && !genotype.hasAttribute(VCFConstants.GENOTYPE_LIKELIHOODS_KEY)) { + // hack to deal with input VCFs with no genotype likelihoods. Just assume the called genotype + // is confident. This is useful for Hapmap and 1KG release VCFs. + double AA = (1.0-prior)/2.0; + double AB = (1.0-prior)/2.0; + double BB = (1.0-prior)/2.0; + + if (genotype.isHomRef()) { AA = prior; } + else if (genotype.isHet()) { AB = prior; } + else if (genotype.isHomVar()) { BB = prior; } + + beagleWriter.printf("%.2f %.2f %.2f ", AA, AB, BB); + + if (beagleGenotypesWriter != null) { + char a = genotype.getAllele(0).toString().charAt(0); + char b = genotype.getAllele(0).toString().charAt(0); + + beagleGenotypesWriter.format("%c %c ", a, b); + } + } + else { + beagleWriter.print("0.33 0.33 0.33 "); // write 1/3 likelihoods for uncalled genotypes. + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.print(". . "); + } + } + + beagleWriter.println(); + if (beagleGenotypesWriter != null) + beagleGenotypesWriter.println(); + } + + public Integer reduceInit() { return 0; // Nothing to do here } @@ -195,6 +276,19 @@ public class ProduceBeagleInputWalker extends RodWalker { public void onTraversalDone( Integer sum ) { - } - + } + + private void initializeVcfWriter() { + + final ArrayList inputNames = new ArrayList(); + inputNames.add( VALIDATION_ROD_NAME ); + + // setup the header fields + Set hInfo = new HashSet(); + hInfo.addAll(VCFUtils.getHeaderFields(getToolkit(), inputNames)); + hInfo.add(new VCFFilterHeaderLine("bootstrap","This site used for genotype bootstrapping with ProduceBeagleInputWalker")); + + bootstrapVCFOutput.writeHeader(new VCFHeader(hInfo, SampleUtils.getUniqueSamplesFromRods(getToolkit(), inputNames))); + } + } diff --git a/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java b/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java index 1ea180ed3..37173246f 100755 --- a/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java +++ b/java/test/org/broadinstitute/sting/gatk/walkers/beagle/BeagleIntegrationTest.java @@ -54,4 +54,27 @@ public class BeagleIntegrationTest extends WalkerTest { executeTest("test BeagleInput", spec); } + @Test + public void testBeagleInput2() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T ProduceBeagleInput -B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_HSQ_chr22_14-16m.vcf "+ + "-B:validation,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/NA12878_OMNI_chr22_14-16m.vcf "+ + "-L 22:14000000-16000000 -o %s -bvcf %s -bs 0.8 -valp 0.98 -R /humgen/1kg/reference/human_g1k_v37.fasta -NO_HEADER ",2, + Arrays.asList("44d28b6b092d5f4c0ae59af442612ea3","481c58f8309916184a33ab1835e5cc48")); + executeTest("test BeagleInputWithBootstrap",spec); + } + + @Test + public void testBeagleOutput2() { + WalkerTestSpec spec = new WalkerTestSpec( + "-T BeagleOutputToVCF -R "+hg19Reference+" "+ + "-B:variant,VCF /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.vcf "+ + "-B:beagleR2,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.r2 "+ + "-B:beagleProbs,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.gprobs.bgl "+ + "-B:beaglePhased,beagle /humgen/gsa-hpprojects/GATK/data/Validation_Data/EUR_beagle_in_test.phased.bgl "+ + "-L 20:1-70000 -o %s -NO_HEADER ",1,Arrays.asList("fd104b7da9e6c99b662c6ad746fd53f3")); + + executeTest("testBeagleChangesSitesToRef",spec); + } + } \ No newline at end of file diff --git a/scala/qscript/chartl/fullCallingPipelineV2.q b/scala/qscript/chartl/fullCallingPipelineV2.q new file mode 100755 index 000000000..d76de59d8 --- /dev/null +++ b/scala/qscript/chartl/fullCallingPipelineV2.q @@ -0,0 +1,122 @@ +import com.sun.tools.doclets.internal.toolkit.util.DocFinder.Input +import org.broadinstitute.sting.gatk.{CommandLineGATK, DownsampleType} +import java.io.File +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.commandline.Argument +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.samtools._ +import org.broadinstitute.sting.queue.pipeline.{BamProcessing,VariantCalling} +import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType +class fullCallingPipelineV2 extends QScript { + qscript => + + @Argument(doc="Number of cleaning jobs", shortName="cleaningJobs") + var cleaningJobs: Int = _ + + @Argument(doc="the YAML file specifying inputs, interval lists, reference sequence, etc.", shortName="Y") + var yamlFile: File = _ + + @Input(doc="path to trigger track (for UnifiedGenotyper)", shortName="trigger", required=false) + var trigger: File = _ + + @Input(doc="path to refseqTable (for GenomicAnnotator)", shortName="refseqTable") + var refseqTable: File = _ + + @Input(doc="path to Picard FixMateInformation.jar. See http://picard.sourceforge.net/ .", required=false) + var picardFixMatesJar: File = new java.io.File("/seq/software/picard/current/bin/FixMateInformation.jar") + + @Input(doc="path to GATK jar") + var gatkJar: File = _ + + @Input(doc="target Ti/Tv ratio for recalibration", shortName="titv", required=true) + var target_titv: Float = _ + + @Input(doc="per-sample downsampling level",shortName="dcov",required=false) + var downsampling_coverage = 300 + + @Input(doc="level of parallelism for UnifiedGenotyper", shortName="snpScatter", required=false) + var num_snp_scatter_jobs = 20 + + @Input(doc="level of parallelism for IndelGenotyperV2", shortName="indelScatter", required=false) + var num_indel_scatter_jobs = 5 + + @Input(doc="Skip indel-cleaning for BAM files (for testing only)", shortName="skipCleaning", required=false) + var skip_cleaning = false + + //@Input(doc="ADPR script") + //var adprScript: File = _ + + //@Input(doc="Sequencing maching name (for use by adpr)") + //var machine: String = _ + + //@Input(doc="Sequencing experiement type (for use by adpr)--Whole_Exome, Whole_Genome, or Hybrid_Selection") + //var protocol: String = _ + + private var pipeline: Pipeline = _ + + trait CommandLineGATKArgs extends CommandLineGATK { + this.intervals = qscript.pipeline.getProject.getIntervalList + this.jarFile = qscript.gatkJar + this.reference_sequence = qscript.pipeline.getProject.getReferenceFile + this.memoryLimit = Some(4) + } + + + // ------------ SETUP THE PIPELINE ----------- // + + + def script = { + pipeline = YamlUtils.load(classOf[Pipeline], qscript.yamlFile) + var callingLib: VariantCalling = new VariantCalling(qscript.yamlFile,qscript.gatkJar) + var cleaningLib: BamProcessing = new BamProcessing(qscript.yamlFile,qscript.gatkJar,qscript.picardFixMatesJar) + + val projectBase: String = qscript.pipeline.getProject.getName + val cleanedBase: String = projectBase + ".cleaned" + val uncleanedBase: String = projectBase + ".uncleaned" + + // there are commands that use all the bam files + val recalibratedSamples = qscript.pipeline.getSamples.filter(_.getBamFiles.contains("recalibrated")) + + var bamsToClean: List[(File,File)] = Nil + var recalBams: List[File] = Nil + var cleanedBams: List[File] = Nil + + for ( sample <- recalibratedSamples ) { + val bam = sample.getBamFiles.get("recalibrated") + recalBams :+= bam + if (!sample.getBamFiles.contains("cleaned")) { + sample.getBamFiles.put("cleaned", swapExt(bam,"bam","cleaned.bam")) + bamsToClean :+= (bam,sample.getBamFiles.get("cleaned")) + } + + cleanedBams :+= sample.getBamFiles.get("cleaned") + } + + if ( !qscript.skip_cleaning ) { + cleaningLib.StandardIndelRealign(bamsToClean,qscript.cleaningJobs) + } + + if (!qscript.skip_cleaning) { + endToEnd(cleanedBase, cleanedBams, lib) + } else { + endToEnd(uncleanedBase, recalBams, lib) + } + } + + + def endToEnd(base: String, bamFiles: List[File], lib: VariantCalling) = { + var recal_vcf = base+"_snps.recal.annotated.tranched.vcf" + var handfilt_vcf = base+"_snps.handfiltered.annotated.vcf" + var indel_vcf = base+"_indel_calls.vcf" + + for ( c <- lib.StandardCallingPipeline(bamFiles,indel_vcf,recal_vcf,handfilt_vcf,qscript.target_titv,qscript.refseqTable) ) { + add(c) + } + } +} diff --git a/scala/qscript/chartl/omni_bootstrap_refine.q b/scala/qscript/chartl/omni_bootstrap_refine.q new file mode 100755 index 000000000..e20fcf550 --- /dev/null +++ b/scala/qscript/chartl/omni_bootstrap_refine.q @@ -0,0 +1,124 @@ +import java.io.{FileReader, File, BufferedReader} +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.DownsampleType +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.samtools._ +import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType + +class omni_bootstrap_refine extends QScript { + => script + + val ref = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + val gatkJar = new File("/humgen/gsa-scr1/chartl/sting/dist/GenomeAnalysisTK.jar") + + trait FooTrait extends CommandLineGATK { + this.reference_sequence = script.ref + this.jarFile = script.gatkJar + this.intervalsString :+= "20" + } + + class SampleOverlap extends CommandLineFunction { + @Input(doc="omni vcf") var omni: File = _ + @Input(doc="other vcf") var other: File = _ + @Output(doc="output sn file") var out : File = _ + + def commandLine = "%s ; %s ; %s ; %s".format(this.cmd1, this.cmd2, this.cmd3, this.cmd4) + + def cmd1 : String = { + return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n' | sort > temp1".format(omni.getAbsolutePath) + } + + def cmd2 : String = { + return "head -n 500 %s | grep #CHR | cut -f10- | tr '\t' '\n'| sort > temp2".format(other.getAbsolutePath) + } + + def cmd3 : String = { + return "cat temp1 temp2 | sort | uniq -c | awk '{if($1 == 2) print $2}' > %s".format(out.getAbsolutePath) + } + + def cmd4 : String = { + return "rm temp1 temp2" + } + } + + class BeagleRefinement extends CommandLineFunction { + @Input(doc="The beagle input file") var beagleInput: File = _ + var beagleOutputBase: String = _ + var beagleMemoryGigs: Int = 4 + + /** + * Note: These get set + */ + @Output(doc="The beagle phased file") var beaglePhasedFile: File = _ + @Output(doc="The beagle likelihood file") var beagleLikelihoods: File = _ + @Output(doc="The beagle r2 file") var beagleRSquared: File = _ + var beagleOutputDir: String = _ + + def freezeOutputs = { + if ( beagleInput.getParent == null ) { + beagleOutputDir = "" + } else { + beagleOutputDir = beagleInput.getParent + } + beaglePhasedFile = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".phased.gz") + beagleLikelihoods = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".gprobs.gz") + beagleRSquared = new File(beagleOutputDir+beagleOutputBase+"."+beagleInput.getName+".r2") + } + + def commandLine = "java -Djava.io.tmpdir=%s -Xmx%dg -jar %s like=%s out=%s".format(beagleInput.getParent,beagleMemoryGigs,beagleJar,beagleInput.getAbsolutePath,beagleOutputBase) + } + + def runme(pop: File, omni: File, base: String) : List[CommandLineFunction] = { + var clf : List[CommandLineFunction] = Nil + val s_overlap = new File(base+"_sample_overlap.txt") + var calcOverlap = new SampleOverlap + calcOverlap.omni = omni + calcOverlap.other = pop + calcOverlap.out = s_overlap + + clf += calcOverlap + + val subset_omni = swapExt(omni,".vcf","_%s_subset.vcf".format(base)) + val subset_pop = swapExt(pop,".vcf","_%s_subset.vcf".format(base)) + + var omSubset = new SelectVariants with FooTrait + omSubset.variantVCF = omni + omSubset.sample = s_overlap.getAbsolutePath + omSubset.out = subset_omni + + clf += omSubset + + var popSubset = new SelectVariants with FooTrait + popSubset.variantVCF = pop + popSubset.sample = s_overlap.getAbsolutePath + popSubset.out = subset_pop + + clf += popSubset + + var bootRefine = new ProduceBeagleInput with FooTrait + bootRefine.variantVCF = popSubset + bootRefine.rodBind :+= new RodBind("validation","VCF",subset_omni) + bootRefine.bootstrap_vcf = swapExt(subset_omni,".vcf",".boot.vcf") + bootRefine.bootstrap = Some(2) + bootRefine.validation_genotype_ptrue = Some(0.98); + bootRefine.out = new File(base+".beagle_input.beagle") + + clf += bootRefine + + var runBeagle = new BeagleRefinement + runBeagle.beagleInput = bootRefine.out + runBeagle.beagleOutputBase = base+"_beagle" + runBeagle.freezeOutputs() + + clf += runBeagle + + var putBack = new BeagleOutputToVCF with FooTrait + putBack. + } +} \ No newline at end of file diff --git a/scala/qscript/chartl/omni_qc.q b/scala/qscript/chartl/omni_qc.q index cae5d77db..39fc1a010 100755 --- a/scala/qscript/chartl/omni_qc.q +++ b/scala/qscript/chartl/omni_qc.q @@ -14,22 +14,28 @@ import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType class omni_qc extends QScript { qscript => - var pilot3_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot3_overlap.txt") - var pilot1_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/pilot1_overlap.txt") - var production_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/production_overlap.txt") - var hiseq_overlap_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878_name.txt") + // NON-OMNI VCF FILES + var pilot3_release_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/pilot3/merge_release/ALL.exon.2010_03.genotypes.vcf","vcf") + var pilot1_ceu_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf") + var pilot1_chb_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/CHBJPT.low_coverage.2010_07.genotypes.vcf.gz","vcf") + var pilot1_yri_vcf = new TaggedFile("/humgen/1kg/releases/pilot_project/2010_07/low_coverage/snps/YRI.low_coverage.2010_07.genotypes.vcf.gz","vcf") + var august_calls_vcf = null + var hiseq_calls_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") + var pilot1_with_na12878_vcf = new TaggedFile("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/calls/v2/N60/lowpass.N60.recal.mG6.retranche.vcf","vcf") + // OMNI VCF FILES + var OMNI_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b36.vcf","vcf") + var OMNI_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_795_b37.deduped.vcf.gz","vcf") + var OMNI_hapmap_b36_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/Omni_2_5_pilot_b36.vcf.gz","vcf") + + // INTERVALS var pilot3_interval_list: String = "/humgen/gsa-hpprojects/1kg/1kg_pilot3/documents/CenterSpecificTargetLists/results/p3overlap.targets.b36.interval_list" var pilot1_interval_list: String = _ var hiseq_interval_list: String = _ var production_interval_list: String = "20" - val Omni_chip_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.vcf.gz","vcf") - val Omni_b37_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/Omni_2_5_603samples.b37.vcf.gz","vcf") - val pilot3_vcf_broad = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.exon.2010_05.broad.genotypes.vcf.gz","vcf") - val pilot1_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/CEU.low_coverage.2010_07.genotypes.vcf.gz","vcf") - val production_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/ALL.production.2010_08.chr20.lowpass.genotypes.vcf","vcf") - val hiSeq_vcf = new TaggedFile("/humgen/gsa-scr1/chartl/projects/omni/resources/NA12878.HiSeq.v9.b36.vcf.gz","vcf") + // OTHER + var failed_qc_samples = new File("/humgen/gsa-scr1/chartl/projects/omni/resources_oct7/failed_qc_samples.txt") trait OmniArgs extends CommandLineGATK { this.reference_sequence = new File("/humgen/gsa-hpprojects/1kg/reference/human_b36_both.fasta") @@ -80,6 +86,10 @@ class omni_qc extends QScript { eval.reportLocation = new TaggedFile(outDir+base+".subset.omni.eval","eval") eval.reportType = Some(VE2TemplateType.R) eval.analysisName = base+"_Eval" + eval.select ++= List("AC==1","AC==2","AC==3","AC==4","AC==5","AC>5") + eval.selectName ++= List("AC1","AC2","AC3","AC4","AC5","AC6+") + eval.disI = true + eval.outputVCF = outDir+base+".eval.interesting.sites.vcf" if ( base.equalsIgnoreCase("production") ) { subset.variantVCF = Omni_b37_vcf @@ -88,6 +98,8 @@ class omni_qc extends QScript { eval.DBSNP = new File("/humgen/gsa-hpprojects/GATK/data/dbsnp_129_b37.rod") } + + add(subset,makeOmniSiteList,eval) } @@ -96,5 +108,6 @@ class omni_qc extends QScript { runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf_broad,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3") runme(hiseq_overlap_samples,hiseq_interval_list,hiSeq_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/hiSeq/","NA12878_HiSeq") runme(production_overlap_samples,production_interval_list,production_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/production/","production") + runme(pilot3_overlap_samples,pilot3_interval_list,pilot3_vcf,"/humgen/gsa-scr1/chartl/projects/omni/scratch/queue/pilot3/","pilot3_release") } -} \ No newline at end of file +} diff --git a/scala/src/org/broadinstitute/sting/queue/QScript.scala b/scala/src/org/broadinstitute/sting/queue/QScript.scala index a795f664c..792e1c5e2 100755 --- a/scala/src/org/broadinstitute/sting/queue/QScript.scala +++ b/scala/src/org/broadinstitute/sting/queue/QScript.scala @@ -38,4 +38,5 @@ trait QScript extends Logging { * Adds one or more command line functions to be run. */ def add(functions: CommandLineFunction*) = this.functions ++= List(functions:_*) + } diff --git a/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala b/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala index 564b65941..2b9e5528f 100644 --- a/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala +++ b/scala/src/org/broadinstitute/sting/queue/function/CommandLineFunction.scala @@ -12,6 +12,9 @@ import org.broadinstitute.sting.queue.function.scattergather.{SimpleTextGatherFu trait CommandLineFunction extends QFunction with Logging { def commandLine: String + /* Is it a gather function? */ + var isGather: Boolean = true + /** Upper memory limit */ var memoryLimit: Option[Int] = None diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala new file mode 100755 index 000000000..15c0183ac --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/BamProcessing.scala @@ -0,0 +1,138 @@ +package org.broadinstitute.sting.queue.pipeline + +import org.broadinstitute.sting.queue.extensions.gatk._ +import java.io.File +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.function.CommandLineFunction +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import net.sf.picard.reference.ReferenceSequenceFileFactory +import org.broadinstitute.sting.utils.{GenomeLoc, GenomeLocParser} +import org.broadinstitute.sting.utils.interval.IntervalUtils +import collection.mutable.{ListBuffer, HashMap} +import collection.JavaConversions +import java.util.Arrays +import org.broadinstitute.sting.queue.util.{PipelineUtils, IOUtils} +import org.broadinstitute.sting.commandline.{Output, Input} +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction + +class BamProcessing(yaml: File, gatkJar: File, fixMatesJar: File) { + library => + + var attributes: Pipeline = YamlUtils.load(classOf[Pipeline],yaml) + + trait StandardCommandLineGATK extends CommandLineGATK { + this.reference_sequence = library.attributes.getProject.getReferenceFile + this.intervals = library.attributes.getProject.getIntervalList + this.DBSNP = library.attributes.getProject.getDbsnpFile + this.memoryLimit = Some(2) + this.jarFile = library.gatkJar + } + + /** + * @Doc: Creates a standard realigner target creator CLF given a bam, an output file, and the contigs over which to run + * @Returns: A CLF for the realigner target creator job + */ + protected def StandardRealignerTargetCreator(bam: File, contigs: List[String], output: File) : RealignerTargetCreator = { + var rtc = new RealignerTargetCreator with StandardCommandLineGATK + rtc.intervals = null + rtc.intervalsString = contigs + rtc.input_file :+= bam + rtc.out = output + rtc.analysisName = "RealignerTargetCreator" + + return rtc + } + + /** + * @Doc: Creates a standard indel cleaner CLF given a bam, the results of the target creator, and an output .bam file + * @Returns: A CLF for the indel cleaning job + */ + protected def StandardIndelCleaner(bam: File, contigs: List[String], targets: File, outBam: File) : IndelRealigner = { + var realigner = new IndelRealigner with StandardCommandLineGATK + realigner.intervalsString = contigs + realigner.intervals = null + realigner.input_file :+= bam + realigner.out = outBam + realigner.targetIntervals = targets + realigner.analysisName = "IndelClean" + realigner.bam_compression = Some(0) + + return realigner + } + + /** + * @Doc: Creates a standard split-by-contig indel cleaner job for a given bam file, RTC output, and bam to merge everything to + * @Returns: A list of CLFs (todo -- wrapped in a Pipeline) + */ + protected def StandardIndelCleanBam(bam: File, jobContigs: List[List[String]], targets: File, cleanedBam: File) : List[CommandLineFunction] = { + var cmds : List[CommandLineFunction] = Nil + var jobSpecs : List[(File,File,List[String])] = jobContigs.map[(File,File,List[String]),List[(File,File,List[String])]]( + ctigs => { (bam, swapExt(bam,".bam",".%s.bam".format(ctigs.mkString("_"))), ctigs) } + ) + var bamsToMerge : List[File] = Nil + for ( spec <- jobSpecs ) { + cmds :+= StandardIndelCleaner(spec._1,spec._3,targets,spec._2) + bamsToMerge :+= spec._2 + } + + cmds :+= StandardPicardFixMates(bamsToMerge,cleanedBam,library.fixMatesJar) + + return cmds + + } + + /** + * @Doc: Given a list of (pairs of) bams and cleaned bams to write to, and a number of jobs, creates a set of + * command line functions to do the target-creating, splitting, cleaning, and merging, returning that list + * of command line functions + * @Returns: A list of command line functions for the full indel realignment pipeline from the collection + * of uncleaned bams to the collection of cleaned bams + */ + protected def StandardIndelRealign( bamsUncleanCleanPairs: List[(File,File)], nJobs: Int = 1 ) : List[CommandLineFunction] = { + val contigsForJobs : List[List[String]] = PipelineUtils.smartSplitContigs(library.attributes.getProject.getReferenceFile, library.attributes.getProject.getIntervalList, nJobs) + var commands : List[CommandLineFunction] = Nil + for ( bamPair <- bamsUncleanCleanPairs ) { + val rtc : RealignerTargetCreator = StandardRealignerTargetCreator(bamPair._1,contigsForJobs.foldLeft[List[String]](Nil)( (a,b) => a ::: b), swapExt(bamPair._1,".bam",".targets") ) + val icbs : List[CommandLineFunction] = StandardIndelCleanBam(bamPair._1,contigsForJobs,rtc.out,bamPair._2) + val sam : SamtoolsIndexFunction = new SamtoolsIndexFunction + sam.bamFile = bamPair._2 + sam.analysisName = "SamtoolsIndex" + commands :+= rtc + commands ++= icbs + commands :+= sam + } + + return commands + } + + /** + * @Doc: Merges N bam files into one bam file, fixing mate pairs in the process; does not assume they are sorted + * @Returns: Command line function for the merge, fix-mate, and sort operation + */ + protected def StandardPicardFixMates(inBams: List[File], outBam: File, picardJar: File) : CommandLineFunction = { + var pfm : PicardFixMates = new PicardFixMates + pfm.bams = inBams + pfm.outBam = outBam + pfm.jarFile = picardJar + pfm.assumeSorted = Some(false) + pfm.memoryLimit = Some(4) + pfm.analysisName = "FixMates" + + return pfm + } + + class PicardFixMates extends PicardBamJarFunction { + @Input(doc="input bam files") var bams: List[File] = Nil + @Output(doc="output bam file") var outBam: File = null + + def inputBams: List[File] = bams + def outputBam: File = outBam + + } + + + def swapExt(file: File, oldExtension: String, newExtension: String) = + new File(file.getName.stripSuffix(oldExtension) + newExtension) + +} diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala new file mode 100755 index 000000000..8cbff5f94 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/QPipeline.scala @@ -0,0 +1,13 @@ +package org.broadinstitute.sting.queue.pipeline + +import org.broadinstitute.sting.queue.function.CommandLineFunction + +trait QPipeline { + var commands: List[CommandLineFunction] = Nil + + def generateCommands + + def track // todo -- implement me + + def removeIntermediate // todo -- implement me +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala new file mode 100755 index 000000000..d25df3284 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/pipeline/VariantCalling.scala @@ -0,0 +1,340 @@ +package org.broadinstitute.sting.queue.pipeline + +import java.io.File +import net.sf.picard.reference.FastaSequenceFile +import org.broadinstitute.sting.datasources.pipeline.Pipeline +import org.broadinstitute.sting.gatk.DownsampleType +import org.broadinstitute.sting.gatk.walkers.genotyper.GenotypeCalculationModel.Model +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.samtools._ +import org.broadinstitute.sting.queue.{QException, QScript} +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.yaml.YamlUtils +import org.broadinstitute.sting.utils.report.VE2ReportFactory.VE2TemplateType +class VariantCalling(yaml: File,gatkJar: File) { + vc => + + // load attributes + var attributes: Pipeline = YamlUtils.load(classOf[Pipeline], yaml) + + /** + * Trait to propagate basic attributes throughout the library + */ + trait StandardCommandLineGATK extends CommandLineGATK { + this.reference_sequence = vc.attributes.getProject.getReferenceFile + this.intervals = vc.attributes.getProject.getIntervalList + this.DBSNP = vc.attributes.getProject.getDbsnpFile + // set global memory limit on the low side. Additional input bams will affect it. + this.memoryLimit = Some(2) + this.jarFile = vc.gatkJar + } + + /** + * @Doc: Creates a standard UnifiedGenotyper CLF from input bams and an output file + * @Return: UnifiedGenotyper with the standard GSA arguments + * @TODO: Add a formula: f(#bams)=memory; allow yaml to specify triggers and perhaps other information + */ + protected def StandardUnifiedGenotyper(bams : List[File], output : File) : UnifiedGenotyper = { + var ug = new UnifiedGenotyper with StandardCommandLineGATK + ug.analysisName = "UnifiedGenotyper" + ug.input_file = bams + ug.group :+= "Standard" + ug.out = output + ug.min_base_quality_score = Some(10) + ug.min_mapping_quality_score = Some(10) + ug.cap_base_quality_by_mapping_quality = true + ug.standard_min_confidence_threshold_for_emitting = Some(10) + ug.standard_min_confidence_threshold_for_calling = Some(30) + ug.trigger_min_confidence_threshold_for_calling = Some(0) + ug.downsample_to_coverage = Some(300) + ug.dt = Some(DownsampleType.BY_SAMPLE) + ug.scatterCount = 50 + + if ( bams.size > 125 ) { + ug.memoryLimit = Some(4) + } + + return ug + + } + + /** + * @Doc: Creates a CLF to call indels on a specific .bam file, outputting to a given output file + * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments + */ + protected def StandardIndelGenotyper(bam : File, output: File) : IndelGenotyperV2 = { + var ig = new IndelGenotyperV2 with StandardCommandLineGATK + ig.analysisName = "IndelGenotyper" + ig.input_file :+= bam + ig.out = output + ig.downsample_to_coverage = Some(300) + return ig + } + + /** + * @Doc: Accessor method to StandardIndelGenotyper that allows it to be marked as a scatter job in a pipeline + * @Returns: An IndelGenotyperV2 CLF with standard GSA arguments, marked as a scatter job + */ + private def StandardIndelGenotyper(bam : File, output: File, gather: Boolean) : IndelGenotyperV2 = { + var ig = StandardIndelGenotyper(bam,output) + ig.isGather = gather + return ig + } + + /** + * @Doc: Combines a list of indel VCFs to a single output file + * @Returns: A CombineVariants CLF with standard GSA arguments + */ + protected def StandardIndelCombine( igList : List[IndelGenotyperV2], output : File ) : CombineVariants = { + var cv = new CombineVariants with StandardCommandLineGATK + cv.out = output + cv.genotypemergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.GenotypeMergeType.UNIQUIFY) + cv.variantmergeoption = Some(org.broadinstitute.sting.gatk.contexts.variantcontext.VariantContextUtils.VariantMergeType.UNION) + cv.analysisName = "IndelGenotyper" + cv.isGather = true + cv.priority = (igList.map[String,List[String]](ig => swapExt(ig.out,".vcf","").getAbsolutePath)).mkString(",") + //cv.priority = (igList.foldLeft[List[String]](Nil)( (prLs, ig) => prLs ::: List(swapExt(ig.out,".vcf","").getAbsolutePath))).mkString(",") + cv.rodBind = igList.map[RodBind,List[RodBind]](ig => new RodBind(swapExt(ig.out,".vcf","").getName,"VCF",ig.out)) + + return cv + + } + + /** + * @Doc: Generates indel calls on a list of .bam files, and merges those calls into an output file. This is a small pipeline. + * @Returns: A list of CLFs that run indel calls and indel merging. User has zero control over individual indel VCF names. + */ + protected def StandardIndelCalls ( bams : List[File], output : File ) : List[CommandLineGATK] = { + var genotypers = bams.foldLeft[List[IndelGenotyperV2]](Nil)( (igs,bam) => igs ::: List(this.StandardIndelGenotyper(bam, swapExt(bam,".bam",".indels.vcf"), false))) + var combine = this.StandardIndelCombine( genotypers, output ) + var callFunctions: List[CommandLineGATK] = genotypers + callFunctions = combine :: callFunctions + + return callFunctions + } + + protected def StandardFilterAtIndels ( snps: File, indels: File, output : File ) : VariantFiltration = { + var iFil = new VariantFiltration with StandardCommandLineGATK + iFil.analysisName = "FilterAtIndels" + iFil.out = output + iFil.mask = indels.getAbsolutePath + iFil.maskName = "Indel_Mask" + iFil.variantVCF = snps + // todo -- cluster size varies with # bams + iFil.clusterSize = Some(5) + iFil.clusterWindowSize = Some(8) + + return iFil + } + + protected def StandardHandfilter( snps: File, output: File ) : VariantFiltration = { + var hFil = new VariantFiltration with StandardCommandLineGATK + hFil.analysisName = "HandFilter" + hFil.out = output + hFil.variantVCF = snps + hFil.filterExpression :+= "QD<5" + hFil.filterName :+= "LowQualByDepth" + hFil.filterExpression :+= "AB>0.75" + hFil.filterName :+= "HighAlleleBalance" + hFil.filterExpression :+= "SB>-0.10" + hFil.filterName :+= "HighStrandBias" + + return hFil + } + + protected def StandardVariantCluster( snps: File, output: File ) : GenerateVariantClusters = { + var genC = new GenerateVariantClusters with StandardCommandLineGATK + genC.analysisName = "VariantQualityRecalibration" + genC.rodBind :+= new RodBind("input","VCF",snps) + genC.cluster_file = output + genC.use_annotation :+= "QD" + genC.use_annotation :+= "SB" + genC.use_annotation :+= "HaplotypeScore" + genC.use_annotation :+= "AB" + genC.use_annotation :+= "HRun" + + return genC + } + + protected def StandardVariantRecalibrator ( raw_vcf: File, cluster: File, target_titv: scala.Double, out_vcf: File, + out_tranches: File, out_dat: File) : VariantRecalibrator = { + var vr = new VariantRecalibrator with StandardCommandLineGATK + vr.analysisName = "VariantQualityRecalibration" + vr.rodBind :+= new RodBind("input","VCF",raw_vcf) + vr.cluster_file = cluster + vr.target_titv = target_titv + vr.out = out_vcf + vr.tranches_file = out_tranches + vr.report_dat_file = out_dat + vr.tranche :+= "0.1" + vr.tranche :+= "1" + vr.tranche :+= "5" + vr.tranche :+= "10" + + return vr + + } + + protected def StandardApplyVariantCuts( snpRecal: File, tranches: File, output: File) : ApplyVariantCuts = { + var avc = new ApplyVariantCuts with StandardCommandLineGATK + avc.analysisName = "VariantQualityRecalibration" + avc.rodBind :+= new RodBind("input","VCF",snpRecal) + avc.out = output + avc.tranches_file = tranches + avc.fdr_filter_level = Some(5) + + return avc + } + + protected def StandardRecalibrateVariants( snps: File, targetTiTv: scala.Double, recalVcf: File) : List[CommandLineGATK] = { + var clust = StandardVariantCluster(snps, swapExt(snps,".vcf",".cluster")) + var recal = StandardVariantRecalibrator(snps,clust.clusterFile,targetTiTv,swapExt(snps,".vcf",".recal.vcf"), + swapExt(snps,".vcf",".recal.tranch"),swapExt(snps,".vcf",".recal.dat")) + var cut = StandardApplyVariantCuts(recal.out,recal.tranches_file,recal.out) + + var cmds: List[CommandLineGATK] = Nil + cmds :+= clust + cmds :+= recal + cmds :+= cut + + return cmds + } + + protected def StandardGenomicAnnotation ( snps: File, refseqFile: File, outputVCF: File) : GenomicAnnotator = { + var ga = new GenomicAnnotator with StandardCommandLineGATK + ga.analysisName = "GenomicAnnotator" + ga.variantVCF = snps + ga.rodBind :+= new RodBind("refseq","AnnotatorInputTable",refseqFile) + ga.rodToIntervalTrackName = "variant" + ga.out = outputVCF + + return ga + } + + protected def StandardSNPCalls( bams: List[File], output: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = { + var commands : List[CommandLineGATK] = Nil + var dir = "" + + if ( output.getParent != null ) { + dir = output.getParent+"/" + } + + var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf") + var ug = StandardUnifiedGenotyper(bams, raw_snp) + + commands :+= ug + + var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf") + var ig = StandardIndelCalls(bams,raw_indel) + + commands ++= ig + + var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") + var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp) + + commands :+= iFilt + + var annoSNP = prefilt_snp + if ( refGene != null ) { + annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") + var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) + + commands :+= annotate + } + + var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, output) + + commands ++= recal + + return commands + } + + protected def StandardSNPCallsBothFilterTypes(bams: List[File], recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = { + var commands : List[CommandLineGATK] = Nil + var dir = "" + + if ( recalOut.getParent != null ) { + dir = recalOut.getParent+"/" + } + + var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf") + var ug = StandardUnifiedGenotyper(bams, raw_snp) + + commands :+= ug + + var raw_indel = new File(dir+vc.attributes.getProject+".raw_indels.vcf") + var ig = StandardIndelCalls(bams,raw_indel) + + commands ++= ig + + var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") + var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp) + + commands :+= iFilt + + var annoSNP = prefilt_snp + if ( refGene != null ) { + annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") + var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) + + commands :+= annotate + } + + var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut) + + commands ++= recal + + var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut) + + commands :+= handFilt + + return commands + } + + protected def StandardCallingPipeline(bams: List[File], indelOut: File, recalOut: File, handFilteredOut: File, targetTiTv: scala.Double, refGene: File = null ) : List[CommandLineGATK] = { + var commands : List[CommandLineGATK] = Nil + var dir = "" + + if ( recalOut.getParent != null ) { + dir = recalOut.getParent+"/" + } + + var raw_snp = new File(dir+vc.attributes.getProject+".raw_snps.vcf") + var ug = StandardUnifiedGenotyper(bams, raw_snp) + + commands :+= ug + + var raw_indel = indelOut + var ig = StandardIndelCalls(bams,raw_indel) + + commands ++= ig + + var prefilt_snp = swapExt(raw_snp,".vcf",".indel_filtered.vcf") + var iFilt = StandardFilterAtIndels(raw_snp,raw_indel,prefilt_snp) + + commands :+= iFilt + + var annoSNP = prefilt_snp + if ( refGene != null ) { + annoSNP = swapExt(prefilt_snp,".vcf",".annotated.vcf") + var annotate = StandardGenomicAnnotation(prefilt_snp, refGene, annoSNP) + + commands :+= annotate + } + + var recal = StandardRecalibrateVariants(annoSNP, targetTiTv, recalOut) + + commands ++= recal + + var handFilt = StandardHandfilter(prefilt_snp,handFilteredOut) + + commands :+= handFilt + + return commands + } + + def swapExt(file: File, oldExtension: String, newExtension: String) = + new File(file.getName.stripSuffix(oldExtension) + newExtension) + +} \ No newline at end of file diff --git a/scala/src/org/broadinstitute/sting/queue/util/PipelineUtils.scala b/scala/src/org/broadinstitute/sting/queue/util/PipelineUtils.scala new file mode 100755 index 000000000..1998b6c76 --- /dev/null +++ b/scala/src/org/broadinstitute/sting/queue/util/PipelineUtils.scala @@ -0,0 +1,55 @@ +package org.broadinstitute.sting.queue.util + +import net.sf.picard.reference.ReferenceSequenceFileFactory +import java.io.File +import org.broadinstitute.sting.utils.GenomeLocParser +import collection.JavaConversions._ +import org.broadinstitute.sting.utils.interval.IntervalUtils + +class PipelineUtils { + +} + +object PipelineUtils{ + + def smartSplitContigs(reference: File, intervals: File, sets: Int) : List[List[String]] = { + GenomeLocParser.setupRefContigOrdering(ReferenceSequenceFileFactory.getReferenceSequenceFile(reference)) + val targets = IntervalUtils.parseIntervalArguments(List(intervals.getAbsolutePath), false) + + // Build up a map of contigs with sizes. + var contigSizes = Map.empty[String, Long] + // todo -- make this look like functional code for Q's sake + //targets.foreach( loc => { contigSizes += loc -> { contigSizes.get(loc.getContig) match { case Some(size) => size + loc.size case None => loc.size } } }) + + for (loc <- targets) { + val contig = loc.getContig + val contigSize = loc.size + contigSizes += contig -> { + contigSizes.get(contig) match { + case Some(size) => size + contigSize + case None => contigSize + } + } + } + + // Keep a list of pairs of sizes with lists of contigs. + var splitContigs = List.empty[(Long, List[String])] + for ((contigName, contigSize) <- contigSizes) { + if (splitContigs.size < sets) { + // If there are fewer than the requested number of sets, just add this contig. + splitContigs :+= contigSize -> List(contigName) + } else { + // If there is already a number of sets + // sort the contigs to get the smallest one first. + splitContigs = splitContigs.sortBy{case (size, contigs) => size} + // Update the pair with the new contig size and name. + var smallContigs = splitContigs.head + smallContigs = (smallContigs._1 + contigSize) -> (smallContigs._2 :+ contigName) + // Re add the pair to the list. + splitContigs = smallContigs :: splitContigs.tail + } + } + + splitContigs.map{case (size, contigs) => contigs} + } +} \ No newline at end of file