From 41c8552d0acdb2bd29f337af56fff9900c69c8ff Mon Sep 17 00:00:00 2001 From: depristo Date: Thu, 20 Jan 2011 12:54:03 +0000 Subject: [PATCH] Added implements HasGenomeLocation to all revelant classes. It's not possible to write generic code for working with objects that support the getLocation() function in HasGenomeLocation. Please, if you have an object that has a location, implement this interface and start using / writing generic functions to sort, compare, etc. these objects. git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5031 348d0f76-0448-11de-a6fe-93d51630548a --- .../sting/gatk/contexts/AlignmentContext.java | 3 +- .../contexts/StratifiedAlignmentContext.java | 3 +- .../simpleDataSources/ResourcePool.java | 3 +- .../sting/gatk/refdata/RODRecordListImpl.java | 3 +- .../gatk/refdata/ReferenceOrderedDatum.java | 3 +- .../sting/gatk/refdata/Transcript.java | 3 +- .../gatk/refdata/utils/FlashBackIterator.java | 3 +- .../sting/gatk/refdata/utils/GATKFeature.java | 3 +- .../gatk/refdata/utils/RODRecordList.java | 3 +- .../walkers/coverage/CallableLociWalker.java | 2 +- .../gatk/walkers/indels/IndelRealigner.java | 2 +- .../phasing/ReadBackedPhasingWalker.java | 2 +- .../broadinstitute/sting/utils/GenomeLoc.java | 7 +- .../pileup/AbstractReadBackedPileup.java | 1 + .../sting/utils/pileup/ReadBackedPileup.java | 3 +- .../providers/LocusViewTemplate.java | 2 +- .../datasources/shards/MockLocusShard.java | 5 +- .../depristo/distributedGATKPerformance.scala | 153 ++++++++++++++++++ 18 files changed, 187 insertions(+), 17 deletions(-) create mode 100755 scala/qscript/oneoffs/depristo/distributedGATKPerformance.scala diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java index 255945619..337c2664c 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/AlignmentContext.java @@ -26,6 +26,7 @@ package org.broadinstitute.sting.gatk.contexts; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.pileup.ReadBackedPileup; @@ -42,7 +43,7 @@ import java.util.*; * Time: 3:01:34 PM * To change this template use File | Settings | File Templates. */ -public class AlignmentContext { +public class AlignmentContext implements HasGenomeLocation { protected GenomeLoc loc = null; protected ReadBackedPileup basePileup = null; protected boolean hasPileupBeenDownsampled; diff --git a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java index 046d874b7..9f1bb7984 100755 --- a/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java +++ b/java/src/org/broadinstitute/sting/gatk/contexts/StratifiedAlignmentContext.java @@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.contexts; import org.broadinstitute.sting.gatk.GenomeAnalysisEngine; import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -39,7 +40,7 @@ import java.util.*; * User: ebanks * Modified: chartl (split by read group) */ -public class StratifiedAlignmentContext { +public class StratifiedAlignmentContext implements HasGenomeLocation { // Definitions: // COMPLETE = full alignment context diff --git a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java index 22f00ac67..08bd8869f 100755 --- a/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java +++ b/java/src/org/broadinstitute/sting/gatk/datasources/simpleDataSources/ResourcePool.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.datasources.simpleDataSources; import net.sf.samtools.SAMSequenceDictionary; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; @@ -189,7 +190,7 @@ class EntireStream implements DataStreamSegment { /** * Models a mapped position within a stream of GATK input data. */ -class MappedStreamSegment implements DataStreamSegment { +class MappedStreamSegment implements DataStreamSegment, HasGenomeLocation { public final GenomeLoc locus; /** diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java b/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java index 3a7b26753..cff97e4ee 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/RODRecordListImpl.java @@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature; import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; @@ -14,7 +15,7 @@ import java.util.*; * Time: 6:10:48 PM * To change this template use File | Settings | File Templates. */ -public class RODRecordListImpl extends AbstractList implements Comparable, Cloneable, RODRecordList { +public class RODRecordListImpl extends AbstractList implements Comparable, Cloneable, RODRecordList, HasGenomeLocation { private List records; private GenomeLoc location = null; private String name = null; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java index c2fa95090..00eaf92e0 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/ReferenceOrderedDatum.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; import java.io.File; import java.io.FileNotFoundException; @@ -13,7 +14,7 @@ import java.io.IOException; * Time: 10:49:47 AM * To change this template use File | Settings | File Templates. */ -public interface ReferenceOrderedDatum extends Comparable { +public interface ReferenceOrderedDatum extends Comparable, HasGenomeLocation { public String getName(); public boolean parseLine(final Object header, final String[] parts) throws IOException; public String toString(); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java index 688ffafee..b8a0868dd 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/Transcript.java @@ -1,6 +1,7 @@ package org.broadinstitute.sting.gatk.refdata; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; import java.util.List; @@ -11,7 +12,7 @@ import java.util.List; * Time: 5:22:30 PM * To change this template use File | Settings | File Templates. */ -public interface Transcript { +public interface Transcript extends HasGenomeLocation { /** Returns id of the transcript (RefSeq NM_* id) */ public String getTranscriptId(); diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java index dc6da7132..2b48f61a6 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/FlashBackIterator.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.utils; import net.sf.samtools.SAMSequenceDictionary; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; import java.util.Comparator; import java.util.LinkedList; @@ -190,7 +191,7 @@ public class FlashBackIterator implements LocationAwareSeekableRODIterator { /** * a list that buffers the location for this rod */ -class ComparableList implements Comparator { +class ComparableList implements Comparator, HasGenomeLocation { private RODRecordList list; private GenomeLoc location = null; public ComparableList(RODRecordList list) { diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java index 9453ec484..1553402a5 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/GATKFeature.java @@ -27,6 +27,7 @@ import org.broad.tribble.Feature; import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.GenomeLocParser; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.exceptions.UserException; @@ -40,7 +41,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException; * This wraps a Tribble feature or a RODatum so that both present the same interface: a genome loc for position and a * way of retrieving the track name. */ -public abstract class GATKFeature implements Feature { +public abstract class GATKFeature implements Feature, HasGenomeLocation { public GATKFeature(String name) { this.name = name; diff --git a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java index 290e5cd5c..00c64edbd 100644 --- a/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java +++ b/java/src/org/broadinstitute/sting/gatk/refdata/utils/RODRecordList.java @@ -24,6 +24,7 @@ package org.broadinstitute.sting.gatk.refdata.utils; import org.broadinstitute.sting.utils.GenomeLoc; +import org.broadinstitute.sting.utils.HasGenomeLocation; import java.util.List; @@ -36,7 +37,7 @@ import java.util.List; * make the RODRecord list an interface, so we can stub in other implementations * during testing. */ -public interface RODRecordList extends List, Comparable { +public interface RODRecordList extends List, Comparable, HasGenomeLocation { public GenomeLoc getLocation(); public String getName(); } diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java index 2cf640317..2c67265d6 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/coverage/CallableLociWalker.java @@ -99,7 +99,7 @@ public class CallableLociWalker extends LocusWalker { } } - private class ReadBin { + private class ReadBin implements HasGenomeLocation { private final ArrayList reads = new ArrayList(); private byte[] reference = null; diff --git a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java index 917871d37..7e2476f8d 100755 --- a/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java +++ b/java/src/org/broadinstitute/sting/gatk/walkers/phasing/ReadBackedPhasingWalker.java @@ -1086,7 +1086,7 @@ public class ReadBackedPhasingWalker extends RodWalker, Cloneable, Serializable { +public class GenomeLoc implements Comparable, Cloneable, Serializable, HasGenomeLocation { /** * the basic components of a genome loc, its contig index, * start and stop position, and (optionally) the contig name @@ -38,7 +38,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable public static final boolean isUnmapped(GenomeLoc loc) { return loc == UNMAPPED; } - + public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all",-1,0,0); + // -------------------------------------------------------------------------------------------------------------- // // constructors @@ -68,6 +69,8 @@ public class GenomeLoc implements Comparable, Cloneable, Serializable // // Accessors and setters // + public final GenomeLoc getLocation() { return this; } + public final String getContig() { return this.contigName; } diff --git a/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java index 85156abf1..c2d660b97 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/AbstractReadBackedPileup.java @@ -25,6 +25,7 @@ package org.broadinstitute.sting.utils.pileup; import org.broadinstitute.sting.gatk.datasources.sample.Sample; +import org.broadinstitute.sting.utils.HasGenomeLocation; import org.broadinstitute.sting.utils.exceptions.ReviewedStingException; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.utils.BaseUtils; diff --git a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java index f4232e7fd..c53476747 100644 --- a/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java +++ b/java/src/org/broadinstitute/sting/utils/pileup/ReadBackedPileup.java @@ -28,6 +28,7 @@ import org.broadinstitute.sting.gatk.datasources.sample.Sample; import org.broadinstitute.sting.utils.GenomeLoc; import org.broadinstitute.sting.gatk.iterators.IterableIterator; import net.sf.samtools.SAMRecord; +import org.broadinstitute.sting.utils.HasGenomeLocation; import java.util.List; import java.util.Collection; @@ -38,7 +39,7 @@ import java.util.Collection; * @author mhanna * @version 0.1 */ -public interface ReadBackedPileup extends Iterable { +public interface ReadBackedPileup extends Iterable, HasGenomeLocation { /** * Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this * does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java index aa8ac5377..d3c92c254 100755 --- a/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/providers/LocusViewTemplate.java @@ -52,7 +52,7 @@ public abstract class LocusViewTemplate extends BaseTest { SAMRecordIterator iterator = new SAMRecordIterator(); GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5); - Shard shard = new LocusShard(new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); + Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.emptyMap()); WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),LocusIteratorByState.NO_FILTERS, new SampleDataSource()); WindowMaker.WindowMakerIterator window = windowMaker.next(); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null); diff --git a/java/test/org/broadinstitute/sting/gatk/datasources/shards/MockLocusShard.java b/java/test/org/broadinstitute/sting/gatk/datasources/shards/MockLocusShard.java index 8b65f0900..db2aaf7b5 100644 --- a/java/test/org/broadinstitute/sting/gatk/datasources/shards/MockLocusShard.java +++ b/java/test/org/broadinstitute/sting/gatk/datasources/shards/MockLocusShard.java @@ -40,6 +40,9 @@ import java.util.Collections; */ public class MockLocusShard extends LocusShard { public MockLocusShard(final GenomeLocParser genomeLocParser,final List intervals) { - super(new SAMDataSource(Collections.emptyList(),genomeLocParser),intervals,null); + super( genomeLocParser, + new SAMDataSource(Collections.emptyList(),genomeLocParser), + intervals, + null); } } diff --git a/scala/qscript/oneoffs/depristo/distributedGATKPerformance.scala b/scala/qscript/oneoffs/depristo/distributedGATKPerformance.scala new file mode 100755 index 000000000..177aa375c --- /dev/null +++ b/scala/qscript/oneoffs/depristo/distributedGATKPerformance.scala @@ -0,0 +1,153 @@ +import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction +import org.broadinstitute.sting.queue.extensions.gatk._ +import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction +import org.broadinstitute.sting.queue.QScript +import org.apache.commons.io.FilenameUtils; + +class DistributedGATKPerformance extends QScript { + qscript => + + @Argument(shortName="gatk", doc="gatk jar file", required=true) + var gatkJarFile: File = _ + + @Argument(shortName="outputDir", doc="output directory", required=false) + var outputDir: String = "" + + @Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false) + var datasets: List[String] = Nil + + @Argument(shortName="long", doc="runs long calculations", required=false) + var long: Boolean = false + + + //@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false) + var noBAQ: Boolean = false + + trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(3); } + + class Target( + val baseName: String, + val reference: File, + val dbsnpFile: String, + val hapmapFile: String, + val maskFile: String, + val bamList: File, + val goldStandard_VCF: File, + val intervals: String, + val titvTarget: Double, + val isLowpass: Boolean) { + val name = qscript.outputDir + baseName + val clusterFile = new File(name + ".clusters") + def rawVCF(part: String) = new File(name + "." + part + ".raw.vcf") + val filteredVCF = new File(name + ".filtered.vcf") + val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf") + val tsRecalibratedVCF = new File(name + ".ts.recalibrated.vcf") + val goldStandardName = qscript.outputDir + "goldStandard/" + baseName + val goldStandardClusterFile = new File(goldStandardName + ".clusters") + } + + val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta") + val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta") + val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta") + val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod" + val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod" + val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf" + val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf" + val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf" + val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf" + val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed" + val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed" + + // ToDos: + // reduce the scope of the datasets so the script is more nimble + // figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps + // create gold standard BAQ'd bam files, no reason to always do it on the fly + + // Analysis to add at the end of the script: + // auto generation of the cluster plots + // spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls + // produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF + + val lowPass: Boolean = true + + val targetDataSets: Map[String, Target] = Map( + "HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18, + "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask", + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"), + new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"), + "chr1", 2.07, !lowPass), + "FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"), + new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass), + "WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18, + "/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask", + new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"), + new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"), + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass), + "TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people + new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass), + "LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36, + new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from + new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass), // chunked interval list to use with Queue's scatter/gather functionality + "LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large + new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"), + new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass), + "LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"), + new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass), + "WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37, hapmap_b37, indelMask_b37, + new File("/humgen/gsa-scr1/carneiro/prj/trio/NA12878Trio.WEx.hg19.bam"), + new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED ** + "/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass) + ) + + def script = { + + // Selects the datasets in the -dataset argument and adds them to targets. + var targets: List[Target] = List() + if (!datasets.isEmpty) + for (ds <- datasets) + targets ::= targetDataSets(ds) // Could check if ds was mispelled, but this way an exception will be thrown, maybe it's better this way? + else // If -dataset is not specified, all datasets are used. + for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values + targets ::= targetDS + + val nWays = if (long) List(1, 2, 5, 10) else List(25, 50, 100) + + for (target <- targets) { + for (nWaysParallel <- nWays) { +// for (nWaysParallel <- List(2, 5)) { + val aname = "distN" + nWaysParallel; + val coordinationFile = new File(target.name + "." + aname + ".distributed.txt") + for ( part <- 1 to nWaysParallel) { + add(new UnifiedGenotyper(target, coordinationFile, part, aname + ".part" + part)) + } + } + } + } + + // 1.) Call SNPs with UG + class UnifiedGenotyper(t: Target, coordinationFile: File, i: Int, part: String) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper with UNIVERSAL_GATK_ARGS { + this.reference_sequence = t.reference + this.processingTracker = coordinationFile + this.intervalsString ++= List(t.intervals) + this.dcov = Some( if ( t.isLowpass ) { 50 } else { 250 } ) + this.stand_call_conf = Some( if ( t.isLowpass ) { 4.0 } else { 30.0 } ) + this.stand_emit_conf = Some( if ( t.isLowpass ) { 4.0 } else { 30.0 } ) + this.input_file :+= t.bamList + this.out = t.rawVCF(part) + this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE}) + this.analysisName = t.name + "_UG." + part + if ( i == 1 ) this.performanceLog = new File(coordinationFile.getAbsolutePath + "." + part + ".pf.log") + if (t.dbsnpFile.endsWith(".rod")) + this.DBSNP = new File(t.dbsnpFile) + else if (t.dbsnpFile.endsWith(".vcf")) + this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile) + } +}