Added implements HasGenomeLocation to all revelant classes. It's not possible to write generic code for working with objects that support the getLocation() function in HasGenomeLocation. Please, if you have an object that has a location, implement this interface and start using / writing generic functions to sort, compare, etc. these objects.

git-svn-id: file:///humgen/gsa-scr1/gsa-engineering/svn_contents/trunk@5031 348d0f76-0448-11de-a6fe-93d51630548a
This commit is contained in:
depristo 2011-01-20 12:54:03 +00:00
parent cacdac3914
commit 41c8552d0a
18 changed files with 187 additions and 17 deletions

View File

@ -26,6 +26,7 @@
package org.broadinstitute.sting.gatk.contexts;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.pileup.ReadBackedPileup;
@ -42,7 +43,7 @@ import java.util.*;
* Time: 3:01:34 PM
* To change this template use File | Settings | File Templates.
*/
public class AlignmentContext {
public class AlignmentContext implements HasGenomeLocation {
protected GenomeLoc loc = null;
protected ReadBackedPileup basePileup = null;
protected boolean hasPileupBeenDownsampled;

View File

@ -27,6 +27,7 @@ package org.broadinstitute.sting.gatk.contexts;
import org.broadinstitute.sting.gatk.GenomeAnalysisEngine;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -39,7 +40,7 @@ import java.util.*;
* User: ebanks
* Modified: chartl (split by read group)
*/
public class StratifiedAlignmentContext<RBP extends ReadBackedPileup> {
public class StratifiedAlignmentContext<RBP extends ReadBackedPileup> implements HasGenomeLocation {
// Definitions:
// COMPLETE = full alignment context

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.datasources.simpleDataSources;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
@ -189,7 +190,7 @@ class EntireStream implements DataStreamSegment {
/**
* Models a mapped position within a stream of GATK input data.
*/
class MappedStreamSegment implements DataStreamSegment {
class MappedStreamSegment implements DataStreamSegment, HasGenomeLocation {
public final GenomeLoc locus;
/**

View File

@ -2,6 +2,7 @@ package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.gatk.refdata.utils.GATKFeature;
import org.broadinstitute.sting.gatk.refdata.utils.RODRecordList;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
@ -14,7 +15,7 @@ import java.util.*;
* Time: 6:10:48 PM
* To change this template use File | Settings | File Templates.
*/
public class RODRecordListImpl extends AbstractList<GATKFeature> implements Comparable<RODRecordList>, Cloneable, RODRecordList {
public class RODRecordListImpl extends AbstractList<GATKFeature> implements Comparable<RODRecordList>, Cloneable, RODRecordList, HasGenomeLocation {
private List<GATKFeature> records;
private GenomeLoc location = null;
private String name = null;

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import java.io.File;
import java.io.FileNotFoundException;
@ -13,7 +14,7 @@ import java.io.IOException;
* Time: 10:49:47 AM
* To change this template use File | Settings | File Templates.
*/
public interface ReferenceOrderedDatum extends Comparable<ReferenceOrderedDatum> {
public interface ReferenceOrderedDatum extends Comparable<ReferenceOrderedDatum>, HasGenomeLocation {
public String getName();
public boolean parseLine(final Object header, final String[] parts) throws IOException;
public String toString();

View File

@ -1,6 +1,7 @@
package org.broadinstitute.sting.gatk.refdata;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import java.util.List;
@ -11,7 +12,7 @@ import java.util.List;
* Time: 5:22:30 PM
* To change this template use File | Settings | File Templates.
*/
public interface Transcript {
public interface Transcript extends HasGenomeLocation {
/** Returns id of the transcript (RefSeq NM_* id) */
public String getTranscriptId();

View File

@ -25,6 +25,7 @@ package org.broadinstitute.sting.gatk.refdata.utils;
import net.sf.samtools.SAMSequenceDictionary;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import java.util.Comparator;
import java.util.LinkedList;
@ -190,7 +191,7 @@ public class FlashBackIterator implements LocationAwareSeekableRODIterator {
/**
* a list that buffers the location for this rod
*/
class ComparableList implements Comparator<ComparableList> {
class ComparableList implements Comparator<ComparableList>, HasGenomeLocation {
private RODRecordList list;
private GenomeLoc location = null;
public ComparableList(RODRecordList list) {

View File

@ -27,6 +27,7 @@ import org.broad.tribble.Feature;
import org.broadinstitute.sting.gatk.refdata.ReferenceOrderedDatum;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.GenomeLocParser;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.exceptions.UserException;
@ -40,7 +41,7 @@ import org.broadinstitute.sting.utils.exceptions.UserException;
* This wraps a Tribble feature or a RODatum so that both present the same interface: a genome loc for position and a
* way of retrieving the track name.
*/
public abstract class GATKFeature implements Feature {
public abstract class GATKFeature implements Feature, HasGenomeLocation {
public GATKFeature(String name) {
this.name = name;

View File

@ -24,6 +24,7 @@
package org.broadinstitute.sting.gatk.refdata.utils;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import java.util.List;
@ -36,7 +37,7 @@ import java.util.List;
* make the RODRecord list an interface, so we can stub in other implementations
* during testing.
*/
public interface RODRecordList extends List<GATKFeature>, Comparable<RODRecordList> {
public interface RODRecordList extends List<GATKFeature>, Comparable<RODRecordList>, HasGenomeLocation {
public GenomeLoc getLocation();
public String getName();
}

View File

@ -99,7 +99,7 @@ public class CallableLociWalker extends LocusWalker<CallableLociWalker.CallableB
CallableBaseState state = null;
}
public static class CallableBaseState {
public static class CallableBaseState implements HasGenomeLocation {
public GenomeLocParser genomeLocParser;
public GenomeLoc loc;
public CalledState state;

View File

@ -1494,7 +1494,7 @@ public class IndelRealigner extends ReadWalker<Integer, Integer> {
}
}
private class ReadBin {
private class ReadBin implements HasGenomeLocation {
private final ArrayList<SAMRecord> reads = new ArrayList<SAMRecord>();
private byte[] reference = null;

View File

@ -1086,7 +1086,7 @@ public class ReadBackedPhasingWalker extends RodWalker<PhasingStatsAndOutput, Ph
// COULD replace with MutableVariantContext if it worked [didn't throw exceptions when trying to call its set() methods]...
private class UnfinishedVariantContext {
private class UnfinishedVariantContext implements HasGenomeLocation {
private String name;
private String contig;
private int start;

View File

@ -17,7 +17,7 @@ import java.io.Serializable;
*
*
*/
public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable {
public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable, HasGenomeLocation {
/**
* the basic components of a genome loc, its contig index,
* start and stop position, and (optionally) the contig name
@ -38,7 +38,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
public static final boolean isUnmapped(GenomeLoc loc) {
return loc == UNMAPPED;
}
public static final GenomeLoc WHOLE_GENOME = new GenomeLoc("all",-1,0,0);
// --------------------------------------------------------------------------------------------------------------
//
// constructors
@ -68,6 +69,8 @@ public class GenomeLoc implements Comparable<GenomeLoc>, Cloneable, Serializable
//
// Accessors and setters
//
public final GenomeLoc getLocation() { return this; }
public final String getContig() {
return this.contigName;
}

View File

@ -25,6 +25,7 @@
package org.broadinstitute.sting.utils.pileup;
import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import org.broadinstitute.sting.utils.exceptions.ReviewedStingException;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.utils.BaseUtils;

View File

@ -28,6 +28,7 @@ import org.broadinstitute.sting.gatk.datasources.sample.Sample;
import org.broadinstitute.sting.utils.GenomeLoc;
import org.broadinstitute.sting.gatk.iterators.IterableIterator;
import net.sf.samtools.SAMRecord;
import org.broadinstitute.sting.utils.HasGenomeLocation;
import java.util.List;
import java.util.Collection;
@ -38,7 +39,7 @@ import java.util.Collection;
* @author mhanna
* @version 0.1
*/
public interface ReadBackedPileup extends Iterable<PileupElement> {
public interface ReadBackedPileup extends Iterable<PileupElement>, HasGenomeLocation {
/**
* Returns a new ReadBackedPileup that is free of deletion spanning reads in this pileup. Note that this
* does not copy the data, so both ReadBackedPileups should not be changed. Doesn't make an unnecessary copy

View File

@ -52,7 +52,7 @@ public abstract class LocusViewTemplate extends BaseTest {
SAMRecordIterator iterator = new SAMRecordIterator();
GenomeLoc shardBounds = genomeLocParser.createGenomeLoc("chr1", 1, 5);
Shard shard = new LocusShard(new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap());
Shard shard = new LocusShard(genomeLocParser, new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),Collections.singletonList(shardBounds),Collections.<SAMReaderID,SAMFileSpan>emptyMap());
WindowMaker windowMaker = new WindowMaker(shard,genomeLocParser,iterator,shard.getGenomeLocs(),LocusIteratorByState.NO_FILTERS, new SampleDataSource());
WindowMaker.WindowMakerIterator window = windowMaker.next();
LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, window.getLocus(), window, null, null);

View File

@ -40,6 +40,9 @@ import java.util.Collections;
*/
public class MockLocusShard extends LocusShard {
public MockLocusShard(final GenomeLocParser genomeLocParser,final List<GenomeLoc> intervals) {
super(new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),intervals,null);
super( genomeLocParser,
new SAMDataSource(Collections.<SAMReaderID>emptyList(),genomeLocParser),
intervals,
null);
}
}

View File

@ -0,0 +1,153 @@
import org.broadinstitute.sting.queue.extensions.picard.PicardBamJarFunction
import org.broadinstitute.sting.queue.extensions.gatk._
import org.broadinstitute.sting.queue.extensions.samtools.SamtoolsIndexFunction
import org.broadinstitute.sting.queue.QScript
import org.apache.commons.io.FilenameUtils;
class DistributedGATKPerformance extends QScript {
qscript =>
@Argument(shortName="gatk", doc="gatk jar file", required=true)
var gatkJarFile: File = _
@Argument(shortName="outputDir", doc="output directory", required=false)
var outputDir: String = ""
@Argument(shortName="dataset", doc="selects the datasets to run. If not provided, all datasets will be used", required=false)
var datasets: List[String] = Nil
@Argument(shortName="long", doc="runs long calculations", required=false)
var long: Boolean = false
//@Argument(shortName="noBAQ", doc="turns off BAQ calculation", required=false)
var noBAQ: Boolean = false
trait UNIVERSAL_GATK_ARGS extends CommandLineGATK { logging_level = "INFO"; jarFile = gatkJarFile; memoryLimit = Some(3); }
class Target(
val baseName: String,
val reference: File,
val dbsnpFile: String,
val hapmapFile: String,
val maskFile: String,
val bamList: File,
val goldStandard_VCF: File,
val intervals: String,
val titvTarget: Double,
val isLowpass: Boolean) {
val name = qscript.outputDir + baseName
val clusterFile = new File(name + ".clusters")
def rawVCF(part: String) = new File(name + "." + part + ".raw.vcf")
val filteredVCF = new File(name + ".filtered.vcf")
val titvRecalibratedVCF = new File(name + ".titv.recalibrated.vcf")
val tsRecalibratedVCF = new File(name + ".ts.recalibrated.vcf")
val goldStandardName = qscript.outputDir + "goldStandard/" + baseName
val goldStandardClusterFile = new File(goldStandardName + ".clusters")
}
val hg18 = new File("/seq/references/Homo_sapiens_assembly18/v0/Homo_sapiens_assembly18.fasta")
val b36 = new File("/humgen/1kg/reference/human_b36_both.fasta")
val b37 = new File("/humgen/1kg/reference/human_g1k_v37.fasta")
val dbSNP_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_hg18.rod"
val dbSNP_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_130_b36.rod"
val dbSNP_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/dbSNP/dbsnp_132_b37.leftAligned.vcf"
val hapmap_hg18 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.hg18_fwd.vcf"
val hapmap_b36 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b36_fwd.vcf"
val hapmap_b37 = "/humgen/gsa-hpprojects/GATK/data/Comparisons/Validated/HapMap/3.3/sites_r27_nr.b37_fwd.vcf"
val indelMask_b36 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b36.bed"
val indelMask_b37 = "/humgen/1kg/processing/pipeline_test_bams/pilot1.dindel.mask.b37.bed"
// ToDos:
// reduce the scope of the datasets so the script is more nimble
// figure out how to give names to all the Queue-LSF logs (other than Q-1931@node1434-24.out) so that it is easier to find logs for certain steps
// create gold standard BAQ'd bam files, no reason to always do it on the fly
// Analysis to add at the end of the script:
// auto generation of the cluster plots
// spike in NA12878 to the exomes and to the lowpass, analysis of how much of her variants are being recovered compared to single sample exome or HiSeq calls
// produce Kiran's Venn plots based on comparison between new VCF and gold standard produced VCF
val lowPass: Boolean = true
val targetDataSets: Map[String, Target] = Map(
"HiSeq" -> new Target("NA12878.HiSeq", hg18, dbSNP_hg18, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.HiSeq.WGS.bwa.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/HiSeq.WGS.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"chr1", 2.07, !lowPass),
"FIN" -> new Target("FIN", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/FIN.79sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass),
"WEx" -> new Target("NA12878.WEx", hg18, dbSNP_hg18, hapmap_hg18,
"/humgen/gsa-hpprojects/dev/depristo/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.indels.10.mask",
new File("/humgen/gsa-hpprojects/NA12878Collection/bams/NA12878.WEx.cleaned.recal.bam"),
new File("/home/radon01/depristo/work/oneOffProjects/1000GenomesProcessingPaper/wgs.v13/GA2.WEx.cleaned.ug.snpfiltered.indelfiltered.vcf"),
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.targets.interval_list", 2.6, !lowPass),
"TGPWExGdA" -> new Target("1000G.WEx.GdA", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/Barcoded_1000G_WEx_Reduced_Plate_1.cleaned.list"), // BUGBUG: reduce from 60 to 20 people
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass),
"LowPassN60" -> new Target("lowpass.N60", b36, dbSNP_b36, hapmap_b36, indelMask_b36,
new File("/humgen/1kg/analysis/bamsForDataProcessingPapers/lowpass_b36/lowpass.chr20.cleaned.matefixed.bam"), // the bam list to call from
new File("/home/radon01/depristo/work/oneOffProjects/VQSRCutByNRS/lowpass.N60.chr20.filtered.vcf"), // the gold standard VCF file to run through the VQSR
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.b36.intervals", 2.3, lowPass), // chunked interval list to use with Queue's scatter/gather functionality
"LowPassAugust" -> new Target("ALL.august.v4", b37, dbSNP_b37, hapmap_b37, indelMask_b37, // BUGBUG: kill this, it is too large
new File("/humgen/1kg/processing/allPopulations_chr20_august_release.cleaned.merged.bams/ALL.cleaned.merged.list"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"),
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass),
"LowPassEUR363Nov" -> new Target("EUR.nov2010", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/1kg/processing/pipeline_test_bams/EUR.363sample.Nov2010.chr20.bam"),
new File("/humgen/gsa-hpprojects/dev/data/AugChr20Calls_v4_3state/ALL.august.v4.chr20.filtered.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/humgen/1kg/processing/pipeline_test_bams/whole_genome_chunked.chr20.hg19.intervals", 2.3, lowPass),
"WExTrio" -> new Target("NA12878Trio.WEx", b37, dbSNP_b37, hapmap_b37, indelMask_b37,
new File("/humgen/gsa-scr1/carneiro/prj/trio/NA12878Trio.WEx.hg19.bam"),
new File("/humgen/gsa-scr1/delangel/NewUG/calls/AugustRelease.filtered_Q50_QD5.0_SB0.0.allSamples.SNPs_hg19.WEx_UG_newUG_MQC.vcf"), // ** THIS GOLD STANDARD NEEDS TO BE CORRECTED **
"/seq/references/HybSelOligos/whole_exome_agilent_1.1_refseq_plus_3_boosters/whole_exome_agilent_1.1_refseq_plus_3_boosters.Homo_sapiens_assembly19.targets.interval_list", 2.6, !lowPass)
)
def script = {
// Selects the datasets in the -dataset argument and adds them to targets.
var targets: List[Target] = List()
if (!datasets.isEmpty)
for (ds <- datasets)
targets ::= targetDataSets(ds) // Could check if ds was mispelled, but this way an exception will be thrown, maybe it's better this way?
else // If -dataset is not specified, all datasets are used.
for (targetDS <- targetDataSets.valuesIterator) // for Scala 2.7 or older, use targetDataSets.values
targets ::= targetDS
val nWays = if (long) List(1, 2, 5, 10) else List(25, 50, 100)
for (target <- targets) {
for (nWaysParallel <- nWays) {
// for (nWaysParallel <- List(2, 5)) {
val aname = "distN" + nWaysParallel;
val coordinationFile = new File(target.name + "." + aname + ".distributed.txt")
for ( part <- 1 to nWaysParallel) {
add(new UnifiedGenotyper(target, coordinationFile, part, aname + ".part" + part))
}
}
}
}
// 1.) Call SNPs with UG
class UnifiedGenotyper(t: Target, coordinationFile: File, i: Int, part: String) extends org.broadinstitute.sting.queue.extensions.gatk.UnifiedGenotyper with UNIVERSAL_GATK_ARGS {
this.reference_sequence = t.reference
this.processingTracker = coordinationFile
this.intervalsString ++= List(t.intervals)
this.dcov = Some( if ( t.isLowpass ) { 50 } else { 250 } )
this.stand_call_conf = Some( if ( t.isLowpass ) { 4.0 } else { 30.0 } )
this.stand_emit_conf = Some( if ( t.isLowpass ) { 4.0 } else { 30.0 } )
this.input_file :+= t.bamList
this.out = t.rawVCF(part)
this.baq = Some( if (noBAQ) {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.OFF} else {org.broadinstitute.sting.utils.baq.BAQ.CalculationMode.RECALCULATE})
this.analysisName = t.name + "_UG." + part
if ( i == 1 ) this.performanceLog = new File(coordinationFile.getAbsolutePath + "." + part + ".pf.log")
if (t.dbsnpFile.endsWith(".rod"))
this.DBSNP = new File(t.dbsnpFile)
else if (t.dbsnpFile.endsWith(".vcf"))
this.rodBind :+= RodBind("dbsnp", "VCF", t.dbsnpFile)
}
}